[official-gcc.git] / gcc / tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2016 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "tm_p.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "cgraph.h"
35 #include "dumpfile.h"
36 #include "alias.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "tree-eh.h"
40 #include "gimplify.h"
41 #include "gimple-iterator.h"
42 #include "gimplify-me.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "tree-ssa-loop-manip.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "expr.h"
50 #include "builtins.h"
51 #include "params.h"
53 /* Return true if load- or store-lanes optab OPTAB is implemented for
54 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
56 static bool
57 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
58 tree vectype, unsigned HOST_WIDE_INT count)
60 machine_mode mode, array_mode;
61 bool limit_p;
63 mode = TYPE_MODE (vectype);
64 limit_p = !targetm.array_mode_supported_p (mode, count);
65 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
66 MODE_INT, limit_p);
68 if (array_mode == BLKmode)
70 if (dump_enabled_p ())
71 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
72 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
73 GET_MODE_NAME (mode), count);
74 return false;
77 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
79 if (dump_enabled_p ())
80 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
81 "cannot use %s<%s><%s>\n", name,
82 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
83 return false;
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_NOTE, vect_location,
88 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
89 GET_MODE_NAME (mode));
91 return true;
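/* Illustrative example (target-dependent): for a 128-bit V4SI VECTYPE and
   COUNT 3 the array mode must hold 3 * 128 = 384 bits.  A target that
   defines such an integer mode for its multi-register load/store-lanes
   instructions can satisfy this; on targets without one, mode_for_size
   returns BLKmode and the lanes optab is rejected above.  */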
95 /* Return the smallest scalar type that is part of STMT.
96 This is used to determine the vectype of the stmt. We generally set the
97 vectype according to the type of the result (lhs). For stmts whose
98 result-type is different than the type of the arguments (e.g., demotion,
99 promotion), vectype will be reset appropriately (later). Note that we have
100 to visit the smallest datatype in this function, because that determines the
101 VF. If the smallest datatype in the loop is present only as the rhs of a
102 promotion operation - we'd miss it.
103 Such a case, where a variable of this datatype does not appear in the lhs
104 anywhere in the loop, can only occur if it's an invariant: e.g.:
105 'int_x = (int) short_inv', which we'd expect to have been optimized away by
106 invariant motion. However, we cannot rely on invariant motion to always
107 take invariants out of the loop, and so in the case of promotion we also
108 have to check the rhs.
109 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
110 types. */
112 tree
113 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
114 HOST_WIDE_INT *rhs_size_unit)
116 tree scalar_type = gimple_expr_type (stmt);
117 HOST_WIDE_INT lhs, rhs;
119 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
121 if (is_gimple_assign (stmt)
122 && (gimple_assign_cast_p (stmt)
123 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
124 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
125 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
127 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
129 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
130 if (rhs < lhs)
131 scalar_type = rhs_type;
134 *lhs_size_unit = lhs;
135 *rhs_size_unit = rhs;
136 return scalar_type;
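/* Illustrative example: for the widening statement
     int_x = (int) short_y;
   the lhs type is 4 bytes and the rhs type is 2 bytes, so *lhs_size_unit
   is 4, *rhs_size_unit is 2 and the short type is returned; the 2-byte
   element is what bounds the vectorization factor.  */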
140 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
141 tested at run-time. Return TRUE if DDR was successfully inserted.
142 Return false if versioning is not supported. */
144 static bool
145 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
147 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
149 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
150 return false;
152 if (dump_enabled_p ())
154 dump_printf_loc (MSG_NOTE, vect_location,
155 "mark for run-time aliasing test between ");
156 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
157 dump_printf (MSG_NOTE, " and ");
158 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
159 dump_printf (MSG_NOTE, "\n");
162 if (optimize_loop_nest_for_size_p (loop))
164 if (dump_enabled_p ())
165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
166 "versioning not supported when optimizing"
167 " for size.\n");
168 return false;
171 /* FORNOW: We don't support versioning with outer-loop vectorization. */
172 if (loop->inner)
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
176 "versioning not yet supported for outer-loops.\n");
177 return false;
180 /* FORNOW: We don't support creating runtime alias tests for non-constant
181 step. */
182 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
183 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
185 if (dump_enabled_p ())
186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
187 "versioning not yet supported for non-constant "
188 "step\n");
189 return false;
192 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
193 return true;
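/* Typical candidate (illustrative): in
     for (i = 0; i < n; i++)
       p[i] = q[i] + 1;
   with unrelated pointers p and q the dependence cannot be resolved at
   compile time, so the DDR is queued here and the loop is later versioned
   under a runtime check that the two accessed ranges do not overlap.  */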
197 /* Function vect_analyze_data_ref_dependence.
199 Return TRUE if there (might) exist a dependence between a memory-reference
200 DRA and a memory-reference DRB. When versioning for alias can check the
201 dependence at run-time, return FALSE. Adjust *MAX_VF according to
202 the data dependence. */
204 static bool
205 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
206 loop_vec_info loop_vinfo, int *max_vf)
208 unsigned int i;
209 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
210 struct data_reference *dra = DDR_A (ddr);
211 struct data_reference *drb = DDR_B (ddr);
212 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
213 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
214 lambda_vector dist_v;
215 unsigned int loop_depth;
217 /* In loop analysis all data references should be vectorizable. */
218 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
219 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
220 gcc_unreachable ();
222 /* Independent data accesses. */
223 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
224 return false;
226 if (dra == drb
227 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
228 return false;
230 /* We do not have to consider dependences between accesses that belong
231 to the same group. */
232 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
233 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
234 return false;
236 /* Even if we have an anti-dependence then, as the vectorized loop covers at
237 least two scalar iterations, there is always also a true dependence.
238 As the vectorizer does not re-order loads and stores we can ignore
239 the anti-dependence if TBAA can disambiguate both DRs similar to the
240 case with known negative distance anti-dependences (positive
241 distance anti-dependences would violate TBAA constraints). */
242 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
243 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
244 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
245 get_alias_set (DR_REF (drb))))
246 return false;
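/* For instance (illustrative), with
     x = *int_p;
     *float_q = y;
   the alias set of the int load does not conflict with that of the float
   store under TBAA, so the check above discards this dependence.  */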
248 /* Unknown data dependence. */
249 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
251 /* If user asserted safelen consecutive iterations can be
252 executed concurrently, assume independence. */
253 if (loop->safelen >= 2)
255 if (loop->safelen < *max_vf)
256 *max_vf = loop->safelen;
257 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
258 return false;
261 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
262 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
264 if (dump_enabled_p ())
266 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
267 "versioning for alias not supported for: "
268 "can't determine dependence between ");
269 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
270 DR_REF (dra));
271 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
272 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
273 DR_REF (drb));
274 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
276 return true;
279 if (dump_enabled_p ())
281 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
282 "versioning for alias required: "
283 "can't determine dependence between ");
284 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
285 DR_REF (dra));
286 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
287 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
288 DR_REF (drb));
289 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
292 /* Add to list of ddrs that need to be tested at run-time. */
293 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
296 /* Known data dependence. */
297 if (DDR_NUM_DIST_VECTS (ddr) == 0)
299 /* If user asserted safelen consecutive iterations can be
300 executed concurrently, assume independence. */
301 if (loop->safelen >= 2)
303 if (loop->safelen < *max_vf)
304 *max_vf = loop->safelen;
305 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
306 return false;
309 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
310 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
312 if (dump_enabled_p ())
314 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
315 "versioning for alias not supported for: "
316 "bad dist vector for ");
317 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
318 DR_REF (dra));
319 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
320 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
321 DR_REF (drb));
322 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
324 return true;
327 if (dump_enabled_p ())
329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
330 "versioning for alias required: "
331 "bad dist vector for ");
332 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
333 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
334 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
335 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
337 /* Add to list of ddrs that need to be tested at run-time. */
338 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
341 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
342 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
344 int dist = dist_v[loop_depth];
346 if (dump_enabled_p ())
347 dump_printf_loc (MSG_NOTE, vect_location,
348 "dependence distance = %d.\n", dist);
350 if (dist == 0)
352 if (dump_enabled_p ())
354 dump_printf_loc (MSG_NOTE, vect_location,
355 "dependence distance == 0 between ");
356 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
357 dump_printf (MSG_NOTE, " and ");
358 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
359 dump_printf (MSG_NOTE, "\n");
362 /* When we perform grouped accesses and perform implicit CSE
363 by detecting equal accesses and doing disambiguation with
364 runtime alias tests like for
365 .. = a[i];
366 .. = a[i+1];
367 a[i] = ..;
368 a[i+1] = ..;
369 *p = ..;
370 .. = a[i];
371 .. = a[i+1];
372 where we will end up loading { a[i], a[i+1] } once, make
373 sure that inserting group loads before the first load and
374 stores after the last store will do the right thing.
375 Similar for groups like
376 a[i] = ...;
377 ... = a[i];
378 a[i+1] = ...;
379 where loads from the group interleave with the store. */
380 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
381 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
383 gimple *earlier_stmt;
384 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
385 if (DR_IS_WRITE
386 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
388 if (dump_enabled_p ())
389 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
390 "READ_WRITE dependence in interleaving."
391 "\n");
392 return true;
396 continue;
399 if (dist > 0 && DDR_REVERSED_P (ddr))
401 /* If DDR_REVERSED_P the order of the data-refs in DDR was
402 reversed (to make distance vector positive), and the actual
403 distance is negative. */
404 if (dump_enabled_p ())
405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
406 "dependence distance negative.\n");
407 /* Record a negative dependence distance to later limit the
408 amount of stmt copying / unrolling we can perform.
409 Only need to handle read-after-write dependence. */
410 if (DR_IS_READ (drb)
411 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
412 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
413 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
414 continue;
417 if (abs (dist) >= 2
418 && abs (dist) < *max_vf)
420 /* The dependence distance requires reduction of the maximal
421 vectorization factor. */
422 *max_vf = abs (dist);
423 if (dump_enabled_p ())
424 dump_printf_loc (MSG_NOTE, vect_location,
425 "adjusting maximal vectorization factor to %i\n",
426 *max_vf);
429 if (abs (dist) >= *max_vf)
431 /* Dependence distance does not create dependence, as far as
432 vectorization is concerned, in this case. */
433 if (dump_enabled_p ())
434 dump_printf_loc (MSG_NOTE, vect_location,
435 "dependence distance >= VF.\n");
436 continue;
439 if (dump_enabled_p ())
441 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
442 "not vectorized, possible dependence "
443 "between data-refs ");
444 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
445 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
446 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
447 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
450 return true;
453 return false;
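/* Worked example (illustrative): for
     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1;
   the dependence distance is 3, so the loop over the distance vectors
   above lowers *max_vf to 3 and the dependence no longer prevents
   vectorization with a smaller factor.  */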
456 /* Function vect_analyze_data_ref_dependences.
458 Examine all the data references in the loop, and make sure there do not
459 exist any data dependences between them. Set *MAX_VF according to
460 the maximum vectorization factor the data dependences allow. */
462 bool
463 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
465 unsigned int i;
466 struct data_dependence_relation *ddr;
468 if (dump_enabled_p ())
469 dump_printf_loc (MSG_NOTE, vect_location,
470 "=== vect_analyze_data_ref_dependences ===\n");
472 LOOP_VINFO_DDRS (loop_vinfo)
473 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
474 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
475 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
476 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
477 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
478 &LOOP_VINFO_DDRS (loop_vinfo),
479 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
480 return false;
482 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
483 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
484 return false;
486 return true;
490 /* Function vect_slp_analyze_data_ref_dependence.
492 Return TRUE if there (might) exist a dependence between a memory-reference
493 DRA and a memory-reference DRB. When versioning for alias can check the
494 dependence at run-time, return FALSE. */
497 static bool
498 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
500 struct data_reference *dra = DDR_A (ddr);
501 struct data_reference *drb = DDR_B (ddr);
503 /* We need to check dependences of statements marked as unvectorizable
504 as well, they still can prohibit vectorization. */
506 /* Independent data accesses. */
507 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
508 return false;
510 if (dra == drb)
511 return false;
513 /* Read-read is OK. */
514 if (DR_IS_READ (dra) && DR_IS_READ (drb))
515 return false;
517 /* If dra and drb are part of the same interleaving chain consider
518 them independent. */
519 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
520 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
521 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
522 return false;
524 /* Unknown data dependence. */
525 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
527 if (dump_enabled_p ())
529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
530 "can't determine dependence between ");
531 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
532 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
533 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
534 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
537 else if (dump_enabled_p ())
539 dump_printf_loc (MSG_NOTE, vect_location,
540 "determined dependence between ");
541 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
542 dump_printf (MSG_NOTE, " and ");
543 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
544 dump_printf (MSG_NOTE, "\n");
547 return true;
551 /* Analyze dependences involved in the transform of SLP NODE. STORES
552 contains the vector of scalar stores of this instance if we are
553 disambiguating the loads. */
555 static bool
556 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
557 vec<gimple *> stores, gimple *last_store)
559 /* This walks over all stmts involved in the SLP load/store done
560 in NODE verifying we can sink them up to the last stmt in the
561 group. */
562 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
563 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
565 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
566 if (access == last_access)
567 continue;
568 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
569 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
570 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
572 gimple *stmt = gsi_stmt (gsi);
573 if (! gimple_vuse (stmt)
574 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
575 continue;
577 /* If we couldn't record a (single) data reference for this
578 stmt we have to give up. */
579 /* ??? Here and below if dependence analysis fails we can resort
580 to the alias oracle which can handle more kinds of stmts. */
581 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
582 if (!dr_b)
583 return false;
585 /* If we run into a store of this same instance (we've just
586 marked those) then delay dependence checking until we run
587 into the last store because this is where it will have
588 been sunk to (and we verify if we can do that as well). */
589 if (gimple_visited_p (stmt))
591 if (stmt != last_store)
592 continue;
593 unsigned i;
594 gimple *store;
595 FOR_EACH_VEC_ELT (stores, i, store)
597 data_reference *store_dr
598 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
599 ddr_p ddr = initialize_data_dependence_relation
600 (dr_a, store_dr, vNULL);
601 if (vect_slp_analyze_data_ref_dependence (ddr))
603 free_dependence_relation (ddr);
604 return false;
606 free_dependence_relation (ddr);
610 ddr_p ddr = initialize_data_dependence_relation (dr_a, dr_b, vNULL);
611 if (vect_slp_analyze_data_ref_dependence (ddr))
613 free_dependence_relation (ddr);
614 return false;
616 free_dependence_relation (ddr);
619 return true;
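/* For example (illustrative), for the SLP store group
     a[0] = x;
     *p = z;
     a[1] = y;
   the walk above meets the intervening store *p between a[0] and the last
   group member a[1] and only succeeds if sinking a[0] = x past *p = z is
   known to be safe.  */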
623 /* Function vect_slp_analyze_instance_dependence.
625 Examine all the data references in the SLP instance INSTANCE and make
626 sure there do not exist any data dependences that would prevent
627 vectorizing it. */
629 bool
630 vect_slp_analyze_instance_dependence (slp_instance instance)
632 if (dump_enabled_p ())
633 dump_printf_loc (MSG_NOTE, vect_location,
634 "=== vect_slp_analyze_instance_dependence ===\n");
636 /* The stores of this instance are at the root of the SLP tree. */
637 slp_tree store = SLP_INSTANCE_TREE (instance);
638 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
639 store = NULL;
641 /* Verify we can sink stores to the vectorized stmt insert location. */
642 gimple *last_store = NULL;
643 if (store)
645 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
646 return false;
648 /* Mark stores in this instance and remember the last one. */
649 last_store = vect_find_last_scalar_stmt_in_slp (store);
650 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
651 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
654 bool res = true;
656 /* Verify we can sink loads to the vectorized stmt insert location,
657 special-casing stores of this instance. */
658 slp_tree load;
659 unsigned int i;
660 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
661 if (! vect_slp_analyze_node_dependences (instance, load,
662 store
663 ? SLP_TREE_SCALAR_STMTS (store)
664 : vNULL, last_store))
666 res = false;
667 break;
670 /* Unset the visited flag. */
671 if (store)
672 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
673 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
675 return res;
678 /* Function vect_compute_data_ref_alignment
680 Compute the misalignment of the data reference DR.
682 Output:
683 1. If during the misalignment computation it is found that the data reference
684 cannot be vectorized then false is returned.
685 2. DR_MISALIGNMENT (DR) is defined.
687 FOR NOW: No analysis is actually performed. Misalignment is calculated
688 only for trivial cases. TODO. */
690 bool
691 vect_compute_data_ref_alignment (struct data_reference *dr)
693 gimple *stmt = DR_STMT (dr);
694 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
695 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
696 struct loop *loop = NULL;
697 tree ref = DR_REF (dr);
698 tree vectype;
699 tree base, base_addr;
700 tree misalign = NULL_TREE;
701 tree aligned_to;
702 tree step;
703 unsigned HOST_WIDE_INT alignment;
705 if (dump_enabled_p ())
706 dump_printf_loc (MSG_NOTE, vect_location,
707 "vect_compute_data_ref_alignment:\n");
709 if (loop_vinfo)
710 loop = LOOP_VINFO_LOOP (loop_vinfo);
712 /* Initialize misalignment to unknown. */
713 SET_DR_MISALIGNMENT (dr, -1);
715 if (tree_fits_shwi_p (DR_STEP (dr)))
716 misalign = DR_INIT (dr);
717 aligned_to = DR_ALIGNED_TO (dr);
718 base_addr = DR_BASE_ADDRESS (dr);
719 vectype = STMT_VINFO_VECTYPE (stmt_info);
721 /* In case the dataref is in an inner-loop of the loop that is being
722 vectorized (LOOP), we use the base and misalignment information
723 relative to the outer-loop (LOOP). This is ok only if the misalignment
724 stays the same throughout the execution of the inner-loop, which is why
725 we have to check that the stride of the dataref in the inner-loop is
726 evenly divisible by the vector size.
727 if (loop && nested_in_vect_loop_p (loop, stmt))
729 tree step = DR_STEP (dr);
731 if (tree_fits_shwi_p (step)
732 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
734 if (dump_enabled_p ())
735 dump_printf_loc (MSG_NOTE, vect_location,
736 "inner step divides the vector-size.\n");
737 misalign = STMT_VINFO_DR_INIT (stmt_info);
738 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
739 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
741 else
743 if (dump_enabled_p ())
744 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
745 "inner step doesn't divide the vector-size.\n");
746 misalign = NULL_TREE;
750 /* Similarly we can only use base and misalignment information relative to
751 an innermost loop if the misalignment stays the same throughout the
752 execution of the loop. As above, this is the case if the stride of
753 the dataref is evenly divisible by the vector size.
754 else
756 tree step = DR_STEP (dr);
757 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
759 if (tree_fits_shwi_p (step)
760 && ((tree_to_shwi (step) * vf)
761 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
763 if (dump_enabled_p ())
764 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
765 "step doesn't divide the vector-size.\n");
766 misalign = NULL_TREE;
770 /* To look at alignment of the base we have to preserve an inner MEM_REF
771 as that carries alignment information of the actual access. */
772 base = ref;
773 while (handled_component_p (base))
774 base = TREE_OPERAND (base, 0);
775 if (TREE_CODE (base) == MEM_REF)
776 base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
777 build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
778 unsigned int base_alignment = get_object_alignment (base);
780 if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
781 DR_VECT_AUX (dr)->base_element_aligned = true;
783 alignment = TYPE_ALIGN_UNIT (vectype);
785 if ((compare_tree_int (aligned_to, alignment) < 0)
786 || !misalign)
788 if (dump_enabled_p ())
790 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
791 "Unknown alignment for access: ");
792 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
793 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
795 return true;
798 if (base_alignment < TYPE_ALIGN (vectype))
800 /* Strip an inner MEM_REF to a bare decl if possible. */
801 if (TREE_CODE (base) == MEM_REF
802 && integer_zerop (TREE_OPERAND (base, 1))
803 && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
804 base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
806 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
808 if (dump_enabled_p ())
810 dump_printf_loc (MSG_NOTE, vect_location,
811 "can't force alignment of ref: ");
812 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
813 dump_printf (MSG_NOTE, "\n");
815 return true;
818 /* Force the alignment of the decl.
819 NOTE: This is the only change to the code we make during
820 the analysis phase, before deciding to vectorize the loop. */
821 if (dump_enabled_p ())
823 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
824 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
825 dump_printf (MSG_NOTE, "\n");
828 DR_VECT_AUX (dr)->base_decl = base;
829 DR_VECT_AUX (dr)->base_misaligned = true;
830 DR_VECT_AUX (dr)->base_element_aligned = true;
833 if (loop && nested_in_vect_loop_p (loop, stmt))
834 step = STMT_VINFO_DR_STEP (stmt_info);
835 else
836 step = DR_STEP (dr);
837 /* If this is a backward running DR then the first access in the larger
838 vectype actually is N-1 elements before the address in the DR.
839 Adjust misalign accordingly. */
840 if (tree_int_cst_sgn (step) < 0)
842 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
843 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
844 otherwise we wouldn't be here. */
845 offset = fold_build2 (MULT_EXPR, ssizetype, offset, step);
846 /* PLUS because STEP was negative. */
847 misalign = size_binop (PLUS_EXPR, misalign, offset);
850 SET_DR_MISALIGNMENT (dr,
851 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
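/* Worked example (illustrative) for the adjustment above: a backward V4SI
   access with step -4 and DR_INIT 0 gets offset 3 * -4 = -12, so misalign
   becomes -12 and the mod_floor by the 16-byte alignment records a
   misalignment of 4 bytes.  */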
853 if (dump_enabled_p ())
855 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
856 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
857 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
858 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
861 return true;
865 /* Function vect_update_misalignment_for_peel
867 DR - the data reference whose misalignment is to be adjusted.
868 DR_PEEL - the data reference whose misalignment is being made
869 zero in the vector loop by the peel.
870 NPEEL - the number of iterations in the peel loop if the misalignment
871 of DR_PEEL is known at compile time. */
873 static void
874 vect_update_misalignment_for_peel (struct data_reference *dr,
875 struct data_reference *dr_peel, int npeel)
877 unsigned int i;
878 vec<dr_p> same_align_drs;
879 struct data_reference *current_dr;
880 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
881 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
882 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
883 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
885 /* For interleaved data accesses the step in the loop must be multiplied by
886 the size of the interleaving group. */
887 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
888 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
889 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
890 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
892 /* It can be assumed that the data refs with the same alignment as dr_peel
893 are aligned in the vector loop. */
894 same_align_drs
895 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
896 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
898 if (current_dr != dr)
899 continue;
900 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
901 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
902 SET_DR_MISALIGNMENT (dr, 0);
903 return;
906 if (known_alignment_for_access_p (dr)
907 && known_alignment_for_access_p (dr_peel))
909 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
910 int misal = DR_MISALIGNMENT (dr);
911 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
912 misal += negative ? -npeel * dr_size : npeel * dr_size;
913 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
914 SET_DR_MISALIGNMENT (dr, misal);
915 return;
918 if (dump_enabled_p ())
919 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
920 SET_DR_MISALIGNMENT (dr, -1);
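/* Worked example (illustrative) for the known-alignment case above:
   peeling NPEEL = 3 iterations of a forward int access (dr_size 4) that
   currently has DR_MISALIGNMENT 4 against a 16-byte vector type gives
   misal = 4 + 3 * 4 = 16, which masked by 15 becomes 0, i.e. this
   reference ends up aligned as well.  */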
924 /* Function verify_data_ref_alignment
926 Return TRUE if DR can be handled with respect to alignment. */
928 static bool
929 verify_data_ref_alignment (data_reference_p dr)
931 enum dr_alignment_support supportable_dr_alignment
932 = vect_supportable_dr_alignment (dr, false);
933 if (!supportable_dr_alignment)
935 if (dump_enabled_p ())
937 if (DR_IS_READ (dr))
938 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
939 "not vectorized: unsupported unaligned load.");
940 else
941 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
942 "not vectorized: unsupported unaligned "
943 "store.");
945 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
946 DR_REF (dr));
947 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
949 return false;
952 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
953 dump_printf_loc (MSG_NOTE, vect_location,
954 "Vectorizing an unaligned access.\n");
956 return true;
959 /* Function vect_verify_datarefs_alignment
961 Return TRUE if all data references in the loop can be
962 handled with respect to alignment. */
964 bool
965 vect_verify_datarefs_alignment (loop_vec_info vinfo)
967 vec<data_reference_p> datarefs = vinfo->datarefs;
968 struct data_reference *dr;
969 unsigned int i;
971 FOR_EACH_VEC_ELT (datarefs, i, dr)
973 gimple *stmt = DR_STMT (dr);
974 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
976 if (!STMT_VINFO_RELEVANT_P (stmt_info))
977 continue;
979 /* For interleaving, only the alignment of the first access matters. */
980 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
981 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
982 continue;
984 /* Strided accesses perform only component accesses, alignment is
985 irrelevant for them. */
986 if (STMT_VINFO_STRIDED_P (stmt_info)
987 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
988 continue;
990 if (! verify_data_ref_alignment (dr))
991 return false;
994 return true;
997 /* Given a memory reference EXP, return whether its alignment is less
998 than its size. */
1000 static bool
1001 not_size_aligned (tree exp)
1003 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1004 return true;
1006 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1007 > get_object_alignment (exp));
1010 /* Function vector_alignment_reachable_p
1012 Return true if vector alignment for DR is reachable by peeling
1013 a few loop iterations. Return false otherwise. */
1015 static bool
1016 vector_alignment_reachable_p (struct data_reference *dr)
1018 gimple *stmt = DR_STMT (dr);
1019 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1020 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1022 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1024 /* For interleaved access we peel only if the number of iterations in
1025 the prolog loop (VF - misalignment) is a multiple of the
1026 number of interleaved accesses. */
1027 int elem_size, mis_in_elements;
1028 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1030 /* FORNOW: handle only known alignment. */
1031 if (!known_alignment_for_access_p (dr))
1032 return false;
1034 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1035 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1037 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1038 return false;
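/* E.g. (illustrative): with V4SI (nelements 4) and a misalignment of
   8 bytes, mis_in_elements is 2 and the prologue needs 4 - 2 = 2
   iterations; that is a multiple of GROUP_SIZE 2, so peeling can reach
   alignment, while for GROUP_SIZE 4 it cannot.  */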
1041 /* If misalignment is known at the compile time then allow peeling
1042 only if natural alignment is reachable through peeling. */
1043 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1045 HOST_WIDE_INT elmsize =
1046 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1047 if (dump_enabled_p ())
1049 dump_printf_loc (MSG_NOTE, vect_location,
1050 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1051 dump_printf (MSG_NOTE,
1052 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1054 if (DR_MISALIGNMENT (dr) % elmsize)
1056 if (dump_enabled_p ())
1057 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1058 "data size does not divide the misalignment.\n");
1059 return false;
1063 if (!known_alignment_for_access_p (dr))
1065 tree type = TREE_TYPE (DR_REF (dr));
1066 bool is_packed = not_size_aligned (DR_REF (dr));
1067 if (dump_enabled_p ())
1068 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1069 "Unknown misalignment, is_packed = %d\n",is_packed);
1070 if ((TYPE_USER_ALIGN (type) && !is_packed)
1071 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1072 return true;
1073 else
1074 return false;
1077 return true;
1081 /* Calculate the cost of the memory access represented by DR. */
1083 static void
1084 vect_get_data_access_cost (struct data_reference *dr,
1085 unsigned int *inside_cost,
1086 unsigned int *outside_cost,
1087 stmt_vector_for_cost *body_cost_vec)
1089 gimple *stmt = DR_STMT (dr);
1090 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1091 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1092 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1093 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1094 int ncopies = vf / nunits;
1096 if (DR_IS_READ (dr))
1097 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1098 NULL, body_cost_vec, false);
1099 else
1100 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1102 if (dump_enabled_p ())
1103 dump_printf_loc (MSG_NOTE, vect_location,
1104 "vect_get_data_access_cost: inside_cost = %d, "
1105 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1109 typedef struct _vect_peel_info
1111 int npeel;
1112 struct data_reference *dr;
1113 unsigned int count;
1114 } *vect_peel_info;
1116 typedef struct _vect_peel_extended_info
1118 struct _vect_peel_info peel_info;
1119 unsigned int inside_cost;
1120 unsigned int outside_cost;
1121 stmt_vector_for_cost body_cost_vec;
1122 } *vect_peel_extended_info;
1125 /* Peeling hashtable helpers. */
1127 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1129 static inline hashval_t hash (const _vect_peel_info *);
1130 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1133 inline hashval_t
1134 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1136 return (hashval_t) peel_info->npeel;
1139 inline bool
1140 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1142 return (a->npeel == b->npeel);
1146 /* Insert DR into peeling hash table with NPEEL as key. */
1148 static void
1149 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1150 loop_vec_info loop_vinfo, struct data_reference *dr,
1151 int npeel)
1153 struct _vect_peel_info elem, *slot;
1154 _vect_peel_info **new_slot;
1155 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1157 elem.npeel = npeel;
1158 slot = peeling_htab->find (&elem);
1159 if (slot)
1160 slot->count++;
1161 else
1163 slot = XNEW (struct _vect_peel_info);
1164 slot->npeel = npeel;
1165 slot->dr = dr;
1166 slot->count = 1;
1167 new_slot = peeling_htab->find_slot (slot, INSERT);
1168 *new_slot = slot;
1171 if (!supportable_dr_alignment
1172 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1173 slot->count += VECT_MAX_COST;
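/* Illustrative use: if three data-refs become aligned by peeling 2
   iterations and one by peeling 5, the table ends up with the entries
   { npeel 2, count 3 } and { npeel 5, count 1 }; the traversals below
   then pick either the most frequent entry or the one with the lowest
   modelled cost.  */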
1177 /* Traverse the peeling hash table to find the peeling option that aligns
1178 the maximum number of data accesses. */
1181 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1182 _vect_peel_extended_info *max)
1184 vect_peel_info elem = *slot;
1186 if (elem->count > max->peel_info.count
1187 || (elem->count == max->peel_info.count
1188 && max->peel_info.npeel > elem->npeel))
1190 max->peel_info.npeel = elem->npeel;
1191 max->peel_info.count = elem->count;
1192 max->peel_info.dr = elem->dr;
1195 return 1;
1199 /* Traverse the peeling hash table and calculate the cost for each peeling
1200 option. Find the one with the lowest cost. */
1203 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1204 _vect_peel_extended_info *min)
1206 vect_peel_info elem = *slot;
1207 int save_misalignment, dummy;
1208 unsigned int inside_cost = 0, outside_cost = 0, i;
1209 gimple *stmt = DR_STMT (elem->dr);
1210 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1211 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1212 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1213 struct data_reference *dr;
1214 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1216 prologue_cost_vec.create (2);
1217 body_cost_vec.create (2);
1218 epilogue_cost_vec.create (2);
1220 FOR_EACH_VEC_ELT (datarefs, i, dr)
1222 stmt = DR_STMT (dr);
1223 stmt_info = vinfo_for_stmt (stmt);
1224 /* For interleaving, only the alignment of the first access
1225 matters. */
1226 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1227 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1228 continue;
1230 /* Strided accesses perform only component accesses, alignment is
1231 irrelevant for them. */
1232 if (STMT_VINFO_STRIDED_P (stmt_info)
1233 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1234 continue;
1236 save_misalignment = DR_MISALIGNMENT (dr);
1237 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1238 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1239 &body_cost_vec);
1240 SET_DR_MISALIGNMENT (dr, save_misalignment);
1243 outside_cost += vect_get_known_peeling_cost
1244 (loop_vinfo, elem->npeel, &dummy,
1245 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1246 &prologue_cost_vec, &epilogue_cost_vec);
1248 /* Prologue and epilogue costs are added to the target model later.
1249 These costs depend only on the scalar iteration cost, the
1250 number of peeling iterations finally chosen, and the number of
1251 misaligned statements. So discard the information found here. */
1252 prologue_cost_vec.release ();
1253 epilogue_cost_vec.release ();
1255 if (inside_cost < min->inside_cost
1256 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1258 min->inside_cost = inside_cost;
1259 min->outside_cost = outside_cost;
1260 min->body_cost_vec.release ();
1261 min->body_cost_vec = body_cost_vec;
1262 min->peel_info.dr = elem->dr;
1263 min->peel_info.npeel = elem->npeel;
1265 else
1266 body_cost_vec.release ();
1268 return 1;
1272 /* Choose best peeling option by traversing peeling hash table and either
1273 choosing an option with the lowest cost (if cost model is enabled) or the
1274 option that aligns as many accesses as possible. */
1276 static struct data_reference *
1277 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1278 loop_vec_info loop_vinfo,
1279 unsigned int *npeel,
1280 stmt_vector_for_cost *body_cost_vec)
1282 struct _vect_peel_extended_info res;
1284 res.peel_info.dr = NULL;
1285 res.body_cost_vec = stmt_vector_for_cost ();
1287 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1289 res.inside_cost = INT_MAX;
1290 res.outside_cost = INT_MAX;
1291 peeling_htab->traverse <_vect_peel_extended_info *,
1292 vect_peeling_hash_get_lowest_cost> (&res);
1294 else
1296 res.peel_info.count = 0;
1297 peeling_htab->traverse <_vect_peel_extended_info *,
1298 vect_peeling_hash_get_most_frequent> (&res);
1301 *npeel = res.peel_info.npeel;
1302 *body_cost_vec = res.body_cost_vec;
1303 return res.peel_info.dr;
1307 /* Function vect_enhance_data_refs_alignment
1309 This pass will use loop versioning and loop peeling in order to enhance
1310 the alignment of data references in the loop.
1312 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1313 original loop is to be vectorized. Any other loops that are created by
1314 the transformations performed in this pass are not supposed to be
1315 vectorized. This restriction will be relaxed.
1317 This pass will require a cost model to guide it whether to apply peeling
1318 or versioning or a combination of the two. For example, the scheme that
1319 Intel uses when given a loop with several memory accesses is as follows:
1320 choose one memory access ('p') whose alignment you want to force by doing
1321 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1322 other accesses are not necessarily aligned, or (2) use loop versioning to
1323 generate one loop in which all accesses are aligned, and another loop in
1324 which only 'p' is necessarily aligned.
1326 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1327 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1328 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1330 Devising a cost model is the most critical aspect of this work. It will
1331 guide us on which access to peel for, whether to use loop versioning, how
1332 many versions to create, etc. The cost model will probably consist of
1333 generic considerations as well as target specific considerations (on
1334 powerpc for example, misaligned stores are more painful than misaligned
1335 loads).
1337 Here are the general steps involved in alignment enhancements:
1339 -- original loop, before alignment analysis:
1340 for (i=0; i<N; i++){
1341 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1342 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1345 -- After vect_compute_data_refs_alignment:
1346 for (i=0; i<N; i++){
1347 x = q[i]; # DR_MISALIGNMENT(q) = 3
1348 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1351 -- Possibility 1: we do loop versioning:
1352 if (p is aligned) {
1353 for (i=0; i<N; i++){ # loop 1A
1354 x = q[i]; # DR_MISALIGNMENT(q) = 3
1355 p[i] = y; # DR_MISALIGNMENT(p) = 0
1358 else {
1359 for (i=0; i<N; i++){ # loop 1B
1360 x = q[i]; # DR_MISALIGNMENT(q) = 3
1361 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1365 -- Possibility 2: we do loop peeling:
1366 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1367 x = q[i];
1368 p[i] = y;
1370 for (i = 3; i < N; i++){ # loop 2A
1371 x = q[i]; # DR_MISALIGNMENT(q) = 0
1372 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1375 -- Possibility 3: combination of loop peeling and versioning:
1376 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1377 x = q[i];
1378 p[i] = y;
1380 if (p is aligned) {
1381 for (i = 3; i<N; i++){ # loop 3A
1382 x = q[i]; # DR_MISALIGNMENT(q) = 0
1383 p[i] = y; # DR_MISALIGNMENT(p) = 0
1386 else {
1387 for (i = 3; i<N; i++){ # loop 3B
1388 x = q[i]; # DR_MISALIGNMENT(q) = 0
1389 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1393 These loops are later passed to loop_transform to be vectorized. The
1394 vectorizer will use the alignment information to guide the transformation
1395 (whether to generate regular loads/stores, or with special handling for
1396 misalignment). */
1398 bool
1399 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1401 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1402 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1403 enum dr_alignment_support supportable_dr_alignment;
1404 struct data_reference *dr0 = NULL, *first_store = NULL;
1405 struct data_reference *dr;
1406 unsigned int i, j;
1407 bool do_peeling = false;
1408 bool do_versioning = false;
1409 bool stat;
1410 gimple *stmt;
1411 stmt_vec_info stmt_info;
1412 unsigned int npeel = 0;
1413 bool all_misalignments_unknown = true;
1414 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1415 unsigned possible_npeel_number = 1;
1416 tree vectype;
1417 unsigned int nelements, mis, same_align_drs_max = 0;
1418 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1419 hash_table<peel_info_hasher> peeling_htab (1);
1421 if (dump_enabled_p ())
1422 dump_printf_loc (MSG_NOTE, vect_location,
1423 "=== vect_enhance_data_refs_alignment ===\n");
1425 /* Reset data so we can safely be called multiple times. */
1426 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1427 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1429 /* While cost model enhancements are expected in the future, the high level
1430 view of the code at this time is as follows:
1432 A) If there is a misaligned access then see if peeling to align
1433 this access can make all data references satisfy
1434 vect_supportable_dr_alignment. If so, update data structures
1435 as needed and return true.
1437 B) If peeling wasn't possible and there is a data reference with an
1438 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1439 then see if loop versioning checks can be used to make all data
1440 references satisfy vect_supportable_dr_alignment. If so, update
1441 data structures as needed and return true.
1443 C) If neither peeling nor versioning were successful then return false if
1444 any data reference does not satisfy vect_supportable_dr_alignment.
1446 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1448 Note, Possibility 3 above (which is peeling and versioning together) is not
1449 being done at this time. */
1451 /* (1) Peeling to force alignment. */
1453 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1454 Considerations:
1455 + How many accesses will become aligned due to the peeling
1456 - How many accesses will become unaligned due to the peeling,
1457 and the cost of misaligned accesses.
1458 - The cost of peeling (the extra runtime checks, the increase
1459 in code size). */
1461 FOR_EACH_VEC_ELT (datarefs, i, dr)
1463 stmt = DR_STMT (dr);
1464 stmt_info = vinfo_for_stmt (stmt);
1466 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1467 continue;
1469 /* For interleaving, only the alignment of the first access
1470 matters. */
1471 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1472 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1473 continue;
1475 /* For invariant accesses there is nothing to enhance. */
1476 if (integer_zerop (DR_STEP (dr)))
1477 continue;
1479 /* Strided accesses perform only component accesses, alignment is
1480 irrelevant for them. */
1481 if (STMT_VINFO_STRIDED_P (stmt_info)
1482 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1483 continue;
1485 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1486 do_peeling = vector_alignment_reachable_p (dr);
1487 if (do_peeling)
1489 if (known_alignment_for_access_p (dr))
1491 unsigned int npeel_tmp;
1492 bool negative = tree_int_cst_compare (DR_STEP (dr),
1493 size_zero_node) < 0;
1495 /* Save info about DR in the hash table. */
1496 vectype = STMT_VINFO_VECTYPE (stmt_info);
1497 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1498 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1499 TREE_TYPE (DR_REF (dr))));
1500 npeel_tmp = (negative
1501 ? (mis - nelements) : (nelements - mis))
1502 & (nelements - 1);
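/* E.g. (illustrative): for a forward V4SI access misaligned by 8 bytes,
   mis is 2 elements and npeel_tmp is (4 - 2) & 3 = 2, i.e. peeling 2
   scalar iterations would align this reference.  */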
1504 /* For multiple types, it is possible that the bigger type access
1505 will have more than one peeling option. E.g., a loop with two
1506 types: one of size (vector size / 4), and the other one of
1507 size (vector size / 8). The vectorization factor will be 8. If both
1508 accesses are misaligned by 3, the first one needs one scalar
1509 iteration to be aligned, and the second one needs 5. But the
1510 first one will also be aligned by peeling 5 scalar
1511 iterations, and in that case both accesses will be aligned.
1512 Hence, except for the immediate peeling amount, we also want
1513 to try adding a full vector size, as long as we don't exceed the
1514 vectorization factor.
1515 We do this automatically for the cost model, since we calculate the cost
1516 for every peeling option. */
1517 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1519 if (STMT_SLP_TYPE (stmt_info))
1520 possible_npeel_number
1521 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1522 else
1523 possible_npeel_number = vf / nelements;
1526 /* Handle the aligned case. We may decide to align some other
1527 access, making DR unaligned. */
1528 if (DR_MISALIGNMENT (dr) == 0)
1530 npeel_tmp = 0;
1531 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1532 possible_npeel_number++;
1535 for (j = 0; j < possible_npeel_number; j++)
1537 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1538 dr, npeel_tmp);
1539 npeel_tmp += nelements;
1542 all_misalignments_unknown = false;
1543 /* Data-ref that was chosen for the case that all the
1544 misalignments are unknown is not relevant anymore, since we
1545 have a data-ref with known alignment. */
1546 dr0 = NULL;
1548 else
1550 /* If we don't know any misalignment values, we prefer
1551 peeling for the data-ref that has the maximum number of data-refs
1552 with the same alignment, unless the target prefers to align
1553 stores over loads. */
1554 if (all_misalignments_unknown)
1556 unsigned same_align_drs
1557 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1558 if (!dr0
1559 || same_align_drs_max < same_align_drs)
1561 same_align_drs_max = same_align_drs;
1562 dr0 = dr;
1564 /* For data-refs with the same number of related
1565 accesses prefer the one where the misalign
1566 computation will be invariant in the outermost loop. */
1567 else if (same_align_drs_max == same_align_drs)
1569 struct loop *ivloop0, *ivloop;
1570 ivloop0 = outermost_invariant_loop_for_expr
1571 (loop, DR_BASE_ADDRESS (dr0));
1572 ivloop = outermost_invariant_loop_for_expr
1573 (loop, DR_BASE_ADDRESS (dr));
1574 if ((ivloop && !ivloop0)
1575 || (ivloop && ivloop0
1576 && flow_loop_nested_p (ivloop, ivloop0)))
1577 dr0 = dr;
1580 if (!first_store && DR_IS_WRITE (dr))
1581 first_store = dr;
1584 /* If there are both known and unknown misaligned accesses in the
1585 loop, we choose peeling amount according to the known
1586 accesses. */
1587 if (!supportable_dr_alignment)
1589 dr0 = dr;
1590 if (!first_store && DR_IS_WRITE (dr))
1591 first_store = dr;
1595 else
1597 if (!aligned_access_p (dr))
1599 if (dump_enabled_p ())
1600 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1601 "vector alignment may not be reachable\n");
1602 break;
1607 /* Check if we can possibly peel the loop. */
1608 if (!vect_can_advance_ivs_p (loop_vinfo)
1609 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1610 || loop->inner)
1611 do_peeling = false;
1613 if (do_peeling
1614 && all_misalignments_unknown
1615 && vect_supportable_dr_alignment (dr0, false))
1617 /* Check if the target requires us to prefer stores over loads, i.e., if
1618 misaligned stores are more expensive than misaligned loads (taking
1619 drs with same alignment into account). */
1620 if (first_store && DR_IS_READ (dr0))
1622 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1623 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1624 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1625 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1626 stmt_vector_for_cost dummy;
1627 dummy.create (2);
1629 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1630 &dummy);
1631 vect_get_data_access_cost (first_store, &store_inside_cost,
1632 &store_outside_cost, &dummy);
1634 dummy.release ();
1636 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1637 aligning the load DR0). */
1638 load_inside_penalty = store_inside_cost;
1639 load_outside_penalty = store_outside_cost;
1640 for (i = 0;
1641 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1642 DR_STMT (first_store))).iterate (i, &dr);
1643 i++)
1644 if (DR_IS_READ (dr))
1646 load_inside_penalty += load_inside_cost;
1647 load_outside_penalty += load_outside_cost;
1649 else
1651 load_inside_penalty += store_inside_cost;
1652 load_outside_penalty += store_outside_cost;
1655 /* Calculate the penalty for leaving DR0 unaligned (by
1656 aligning the FIRST_STORE). */
1657 store_inside_penalty = load_inside_cost;
1658 store_outside_penalty = load_outside_cost;
1659 for (i = 0;
1660 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1661 DR_STMT (dr0))).iterate (i, &dr);
1662 i++)
1663 if (DR_IS_READ (dr))
1665 store_inside_penalty += load_inside_cost;
1666 store_outside_penalty += load_outside_cost;
1668 else
1670 store_inside_penalty += store_inside_cost;
1671 store_outside_penalty += store_outside_cost;
1674 if (load_inside_penalty > store_inside_penalty
1675 || (load_inside_penalty == store_inside_penalty
1676 && load_outside_penalty > store_outside_penalty))
1677 dr0 = first_store;
1680 /* In case there are only loads with different unknown misalignments, use
1681 peeling only if it may help to align other accesses in the loop or
1682 if it may help improve load bandwidth when we'd end up using
1683 unaligned loads. */
1684 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1685 if (!first_store
1686 && !STMT_VINFO_SAME_ALIGN_REFS (
1687 vinfo_for_stmt (DR_STMT (dr0))).length ()
1688 && (vect_supportable_dr_alignment (dr0, false)
1689 != dr_unaligned_supported
1690 || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1691 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1692 do_peeling = false;
1695 if (do_peeling && !dr0)
1697 /* Peeling is possible, but there is no data access that is supported
1698 only when aligned. So we try to choose the best possible peeling. */
1700 /* We should get here only if there are drs with known misalignment. */
1701 gcc_assert (!all_misalignments_unknown);
1703 /* Choose the best peeling from the hash table. */
1704 dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
1705 loop_vinfo, &npeel,
1706 &body_cost_vec);
1707 if (!dr0 || !npeel)
1708 do_peeling = false;
1711 if (do_peeling)
1713 stmt = DR_STMT (dr0);
1714 stmt_info = vinfo_for_stmt (stmt);
1715 vectype = STMT_VINFO_VECTYPE (stmt_info);
1716 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1718 if (known_alignment_for_access_p (dr0))
1720 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1721 size_zero_node) < 0;
1722 if (!npeel)
1724 /* Since it's known at compile time, compute the number of
1725 iterations in the peeled loop (the peeling factor) for use in
1726 updating DR_MISALIGNMENT values. The peeling factor is the
1727 vectorization factor minus the misalignment as an element
1728 count. */
1729 mis = DR_MISALIGNMENT (dr0);
1730 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1731 npeel = ((negative ? mis - nelements : nelements - mis)
1732 & (nelements - 1));
1735 /* For interleaved data access every iteration accesses all the
1736 members of the group, therefore we divide the number of iterations
1737 by the group size. */
1738 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1739 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1740 npeel /= GROUP_SIZE (stmt_info);
1742 if (dump_enabled_p ())
1743 dump_printf_loc (MSG_NOTE, vect_location,
1744 "Try peeling by %d\n", npeel);
1747 /* Ensure that all data refs can be vectorized after the peel. */
1748 FOR_EACH_VEC_ELT (datarefs, i, dr)
1750 int save_misalignment;
1752 if (dr == dr0)
1753 continue;
1755 stmt = DR_STMT (dr);
1756 stmt_info = vinfo_for_stmt (stmt);
1757 /* For interleaving, only the alignment of the first access
1758 matters. */
1759 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1760 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1761 continue;
1763 /* Strided accesses perform only component accesses, alignment is
1764 irrelevant for them. */
1765 if (STMT_VINFO_STRIDED_P (stmt_info)
1766 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1767 continue;
1769 save_misalignment = DR_MISALIGNMENT (dr);
1770 vect_update_misalignment_for_peel (dr, dr0, npeel);
1771 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1772 SET_DR_MISALIGNMENT (dr, save_misalignment);
1774 if (!supportable_dr_alignment)
1776 do_peeling = false;
1777 break;
1781 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1783 stat = vect_verify_datarefs_alignment (loop_vinfo);
1784 if (!stat)
1785 do_peeling = false;
1786 else
1788 body_cost_vec.release ();
1789 return stat;
1793 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1794 if (do_peeling)
1796 unsigned max_allowed_peel
1797 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1798 if (max_allowed_peel != (unsigned)-1)
1800 unsigned max_peel = npeel;
1801 if (max_peel == 0)
1803 gimple *dr_stmt = DR_STMT (dr0);
1804 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1805 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1806 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1808 if (max_peel > max_allowed_peel)
1810 do_peeling = false;
1811 if (dump_enabled_p ())
1812 dump_printf_loc (MSG_NOTE, vect_location,
1813 "Disable peeling, max peels reached: %d\n", max_peel);
1818 /* Cost model #2 - if peeling may result in a remaining loop not
1819 iterating enough to be vectorized then do not peel. */
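/* For example, with a vectorization factor of 4 and NPEEL == 3, peeling is
   kept only if at least 4 + 3 == 7 iterations are known at compile time,
   so that a full vector iteration remains after the peel.  */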
1820 if (do_peeling
1821 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1823 unsigned max_peel
1824 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1825 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1826 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1827 do_peeling = false;
1830 if (do_peeling)
1832 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1833 If the misalignment of DR_i is identical to that of dr0 then set
1834 DR_MISALIGNMENT (DR_i) to zero. If the misalignments of DR_i and
1835 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1836 by the peeling factor times the element size of DR_i (MOD the
1837 vectorization factor times the size). Otherwise, the
1838 misalignment of DR_i must be set to unknown. */
1839 FOR_EACH_VEC_ELT (datarefs, i, dr)
1840 if (dr != dr0)
1842 /* Strided accesses perform only component accesses, alignment
1843 is irrelevant for them. */
1844 stmt_info = vinfo_for_stmt (DR_STMT (dr));
1845 if (STMT_VINFO_STRIDED_P (stmt_info)
1846 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1847 continue;
1849 vect_update_misalignment_for_peel (dr, dr0, npeel);
1852 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1853 if (npeel)
1854 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1855 else
1856 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1857 = DR_MISALIGNMENT (dr0);
1858 SET_DR_MISALIGNMENT (dr0, 0);
1859 if (dump_enabled_p ())
1861 dump_printf_loc (MSG_NOTE, vect_location,
1862 "Alignment of access forced using peeling.\n");
1863 dump_printf_loc (MSG_NOTE, vect_location,
1864 "Peeling for alignment will be applied.\n");
1866 /* The inside-loop cost will be accounted for in vectorizable_load
1867 and vectorizable_store correctly with adjusted alignments.
1868 Drop the body_cost_vec on the floor here. */
1869 body_cost_vec.release ();
1871 stat = vect_verify_datarefs_alignment (loop_vinfo);
1872 gcc_assert (stat);
1873 return stat;
1877 body_cost_vec.release ();
1879 /* (2) Versioning to force alignment. */
1881 /* Try versioning if:
1882 1) the loop is optimized for speed, and
1883 2) there is at least one unsupported misaligned data ref with an unknown
1884 misalignment, and
1885 3) all misaligned data refs with a known misalignment are supported, and
1886 4) the number of runtime alignment checks is within reason. */
1888 do_versioning =
1889 optimize_loop_nest_for_speed_p (loop)
1890 && (!loop->inner); /* FORNOW */
1892 if (do_versioning)
1894 FOR_EACH_VEC_ELT (datarefs, i, dr)
1896 stmt = DR_STMT (dr);
1897 stmt_info = vinfo_for_stmt (stmt);
1899 /* For interleaving, only the alignment of the first access
1900 matters. */
1901 if (aligned_access_p (dr)
1902 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1903 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1904 continue;
1906 if (STMT_VINFO_STRIDED_P (stmt_info))
1908 /* Strided loads perform only component accesses, alignment is
1909 irrelevant for them. */
1910 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1911 continue;
1912 do_versioning = false;
1913 break;
1916 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1918 if (!supportable_dr_alignment)
1920 gimple *stmt;
1921 int mask;
1922 tree vectype;
1924 if (known_alignment_for_access_p (dr)
1925 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1926 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1928 do_versioning = false;
1929 break;
1932 stmt = DR_STMT (dr);
1933 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1934 gcc_assert (vectype);
1936 /* The rightmost bits of an aligned address must be zeros.
1937 Construct the mask needed for this test. For example,
1938 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1939 mask must be 15 = 0xf. */
1940 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
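/* E.g. MASK is 15 for 16-byte vectors; the versioning condition built from
   LOOP_VINFO_PTR_MASK effectively checks that
   (addr_1 | ... | addr_n) & MASK == 0 for all addresses recorded below.  */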
1942 /* FORNOW: use the same mask to test all potentially unaligned
1943 references in the loop. The vectorizer currently supports
1944 a single vector size, see the reference to
1945 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1946 vectorization factor is computed. */
1947 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1948 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1949 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1950 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1951 DR_STMT (dr));
1955 /* Versioning requires at least one misaligned data reference. */
1956 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1957 do_versioning = false;
1958 else if (!do_versioning)
1959 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1962 if (do_versioning)
1964 vec<gimple *> may_misalign_stmts
1965 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1966 gimple *stmt;
1968 /* It can now be assumed that the data references in the statements
1969 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1970 of the loop being vectorized. */
1971 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1973 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1974 dr = STMT_VINFO_DATA_REF (stmt_info);
1975 SET_DR_MISALIGNMENT (dr, 0);
1976 if (dump_enabled_p ())
1977 dump_printf_loc (MSG_NOTE, vect_location,
1978 "Alignment of access forced using versioning.\n");
1981 if (dump_enabled_p ())
1982 dump_printf_loc (MSG_NOTE, vect_location,
1983 "Versioning for alignment will be applied.\n");
1985 /* Peeling and versioning can't be done together at this time. */
1986 gcc_assert (! (do_peeling && do_versioning));
1988 stat = vect_verify_datarefs_alignment (loop_vinfo);
1989 gcc_assert (stat);
1990 return stat;
1993 /* This point is reached if neither peeling nor versioning is being done. */
1994 gcc_assert (! (do_peeling || do_versioning));
1996 stat = vect_verify_datarefs_alignment (loop_vinfo);
1997 return stat;
2001 /* Function vect_find_same_alignment_drs.
2003 Update group and alignment relations according to the chosen
2004 vectorization factor. */
2006 static void
2007 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
2008 loop_vec_info loop_vinfo)
2010 unsigned int i;
2011 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2012 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2013 struct data_reference *dra = DDR_A (ddr);
2014 struct data_reference *drb = DDR_B (ddr);
2015 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2016 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2017 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
2018 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
2019 lambda_vector dist_v;
2020 unsigned int loop_depth;
2022 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2023 return;
2025 if (dra == drb)
2026 return;
2028 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
2029 return;
2031 /* Loop-based vectorization and known data dependence. */
2032 if (DDR_NUM_DIST_VECTS (ddr) == 0)
2033 return;
2035 /* Data-dependence analysis reports a distance vector of zero
2036 for data-references that overlap only in the first iteration
2037 but have steps of different sign (see PR45764).
2038 So as a sanity check require equal DR_STEP. */
2039 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2040 return;
2042 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
2043 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
2045 int dist = dist_v[loop_depth];
2047 if (dump_enabled_p ())
2048 dump_printf_loc (MSG_NOTE, vect_location,
2049 "dependence distance = %d.\n", dist);
2051 /* Same loop iteration. */
2052 if (dist == 0
2053 || (dist % vectorization_factor == 0 && dra_size == drb_size))
2055 /* Two references with distance zero have the same alignment. */
2056 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2057 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2058 if (dump_enabled_p ())
2060 dump_printf_loc (MSG_NOTE, vect_location,
2061 "accesses have the same alignment.\n");
2062 dump_printf (MSG_NOTE,
2063 "dependence distance modulo vf == 0 between ");
2064 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2065 dump_printf (MSG_NOTE, " and ");
2066 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2067 dump_printf (MSG_NOTE, "\n");
2074 /* Function vect_analyze_data_refs_alignment
2076 Analyze the alignment of the data-references in the loop.
2077 Return FALSE if a data reference is found that cannot be vectorized. */
2079 bool
2080 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2082 if (dump_enabled_p ())
2083 dump_printf_loc (MSG_NOTE, vect_location,
2084 "=== vect_analyze_data_refs_alignment ===\n");
2086 /* Mark groups of data references with the same alignment using
2087 data dependence information. */
2088 vec<ddr_p> ddrs = vinfo->ddrs;
2089 struct data_dependence_relation *ddr;
2090 unsigned int i;
2092 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2093 vect_find_same_alignment_drs (ddr, vinfo);
2095 vec<data_reference_p> datarefs = vinfo->datarefs;
2096 struct data_reference *dr;
2098 FOR_EACH_VEC_ELT (datarefs, i, dr)
2100 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2101 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2102 && !vect_compute_data_ref_alignment (dr))
2104 /* Strided accesses perform only component accesses, misalignment
2105 information is irrelevant for them. */
2106 if (STMT_VINFO_STRIDED_P (stmt_info)
2107 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2108 continue;
2110 if (dump_enabled_p ())
2111 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2112 "not vectorized: can't calculate alignment "
2113 "for data ref.\n");
2115 return false;
2119 return true;
2123 /* Analyze alignment of DRs of stmts in NODE. */
2125 static bool
2126 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2128 /* We vectorize from the first scalar stmt in the node unless
2129 the node is permuted in which case we start from the first
2130 element in the group. */
2131 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2132 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2133 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2134 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2136 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2137 if (! vect_compute_data_ref_alignment (dr)
2138 /* For creating the data-ref pointer we need alignment of the
2139 first element anyway. */
2140 || (dr != first_dr
2141 && ! vect_compute_data_ref_alignment (first_dr))
2142 || ! verify_data_ref_alignment (dr))
2144 if (dump_enabled_p ())
2145 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2146 "not vectorized: bad data alignment in basic "
2147 "block.\n");
2148 return false;
2151 return true;
2154 /* Function vect_slp_analyze_and_verify_instance_alignment
2156 Analyze the alignment of the data-references in the SLP instance.
2157 Return FALSE if a data reference is found that cannot be vectorized. */
2159 bool
2160 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2162 if (dump_enabled_p ())
2163 dump_printf_loc (MSG_NOTE, vect_location,
2164 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2166 slp_tree node;
2167 unsigned i;
2168 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2169 if (! vect_slp_analyze_and_verify_node_alignment (node))
2170 return false;
2172 node = SLP_INSTANCE_TREE (instance);
2173 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2174 && ! vect_slp_analyze_and_verify_node_alignment
2175 (SLP_INSTANCE_TREE (instance)))
2176 return false;
2178 return true;
2182 /* Analyze groups of accesses: check that DR belongs to a group of
2183 accesses of legal size, step, etc. Detect gaps, single element
2184 interleaving, and other special cases. Set grouped access info.
2185 Collect groups of strided stores for further use in SLP analysis.
2186 Worker for vect_analyze_group_access. */
2188 static bool
2189 vect_analyze_group_access_1 (struct data_reference *dr)
2191 tree step = DR_STEP (dr);
2192 tree scalar_type = TREE_TYPE (DR_REF (dr));
2193 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2194 gimple *stmt = DR_STMT (dr);
2195 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2196 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2197 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2198 HOST_WIDE_INT dr_step = -1;
2199 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2200 bool slp_impossible = false;
2202 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2203 size of the interleaving group (including gaps). */
2204 if (tree_fits_shwi_p (step))
2206 dr_step = tree_to_shwi (step);
2207 /* Check that STEP is a multiple of type size. Otherwise there is
2208 a non-element-sized gap at the end of the group which we
2209 cannot represent in GROUP_GAP or GROUP_SIZE.
2210 ??? As we can handle non-constant step fine here we should
2211 simply remove uses of GROUP_GAP between the last and first
2212 element and instead rely on DR_STEP. GROUP_SIZE then would
2213 simply not include that gap. */
2214 if ((dr_step % type_size) != 0)
2216 if (dump_enabled_p ())
2218 dump_printf_loc (MSG_NOTE, vect_location,
2219 "Step ");
2220 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2221 dump_printf (MSG_NOTE,
2222 " is not a multiple of the element size for ");
2223 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2224 dump_printf (MSG_NOTE, "\n");
2226 return false;
2228 groupsize = absu_hwi (dr_step) / type_size;
2230 else
2231 groupsize = 0;
2233 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2234 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2236 /* Check whether this DR is part of an interleaving group and is the single
2237 element of the group that is accessed in the loop. */
2239 /* Gaps are supported only for loads. STEP must be a multiple of the type
2240 size. The size of the group must be a power of 2. */
2241 if (DR_IS_READ (dr)
2242 && (dr_step % type_size) == 0
2243 && groupsize > 0
2244 && exact_log2 (groupsize) != -1)
2246 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2247 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2248 GROUP_GAP (stmt_info) = groupsize - 1;
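/* For example, a load of a[4*i] with 4-byte elements has DR_STEP == 16,
   so GROUPSIZE is 4 and a gap of 3 elements follows each accessed
   element.  */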
2249 if (dump_enabled_p ())
2251 dump_printf_loc (MSG_NOTE, vect_location,
2252 "Detected single element interleaving ");
2253 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2254 dump_printf (MSG_NOTE, " step ");
2255 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2256 dump_printf (MSG_NOTE, "\n");
2259 return true;
2262 if (dump_enabled_p ())
2264 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2265 "not consecutive access ");
2266 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2269 if (bb_vinfo)
2271 /* Mark the statement as unvectorizable. */
2272 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2273 return true;
2276 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2277 STMT_VINFO_STRIDED_P (stmt_info) = true;
2278 return true;
2281 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2283 /* First stmt in the interleaving chain. Check the chain. */
2284 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2285 struct data_reference *data_ref = dr;
2286 unsigned int count = 1;
2287 tree prev_init = DR_INIT (data_ref);
2288 gimple *prev = stmt;
2289 HOST_WIDE_INT diff, gaps = 0;
2291 while (next)
2293 /* Skip identical data-refs. If two or more stmts share a
2294 data-ref (supported only for loads), we vectorize only the first
2295 stmt, and the rest get their vectorized loads from the first
2296 one. */
2297 if (!tree_int_cst_compare (DR_INIT (data_ref),
2298 DR_INIT (STMT_VINFO_DATA_REF (
2299 vinfo_for_stmt (next)))))
2301 if (DR_IS_WRITE (data_ref))
2303 if (dump_enabled_p ())
2304 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2305 "Two store stmts share the same dr.\n");
2306 return false;
2309 if (dump_enabled_p ())
2310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2311 "Two or more load stmts share the same dr.\n");
2313 /* For load use the same data-ref load. */
2314 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2316 prev = next;
2317 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2318 continue;
2321 prev = next;
2322 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2324 /* All group members have the same STEP by construction. */
2325 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2327 /* Check that the distance between two accesses is equal to the type
2328 size. Otherwise, we have gaps. */
2329 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2330 - TREE_INT_CST_LOW (prev_init)) / type_size;
2331 if (diff != 1)
2333 /* FORNOW: SLP of accesses with gaps is not supported. */
2334 slp_impossible = true;
2335 if (DR_IS_WRITE (data_ref))
2337 if (dump_enabled_p ())
2338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2339 "interleaved store with gaps\n");
2340 return false;
2343 gaps += diff - 1;
2346 last_accessed_element += diff;
2348 /* Store the gap from the previous member of the group. If there is no
2349 gap in the access, GROUP_GAP is always 1. */
2350 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2352 prev_init = DR_INIT (data_ref);
2353 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2354 /* Count the number of data-refs in the chain. */
2355 count++;
2358 if (groupsize == 0)
2359 groupsize = count + gaps;
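/* For example, if DR_STEP is not constant and the chain has members at
   element offsets 0 and 2 (COUNT == 2, one element missing so GAPS == 1),
   GROUPSIZE becomes 3.  */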
2361 if (groupsize > UINT_MAX)
2363 if (dump_enabled_p ())
2364 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2365 "group is too large\n");
2366 return false;
2369 /* Check that the size of the interleaving is equal to count for stores,
2370 i.e., that there are no gaps. */
2371 if (groupsize != count
2372 && !DR_IS_READ (dr))
2374 if (dump_enabled_p ())
2375 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2376 "interleaved store with gaps\n");
2377 return false;
2380 /* If there is a gap after the last load in the group it is the
2381 difference between the groupsize and the last accessed
2382 element.
2383 When there is no gap, this difference should be 0. */
2384 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2386 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2387 if (dump_enabled_p ())
2389 dump_printf_loc (MSG_NOTE, vect_location,
2390 "Detected interleaving ");
2391 if (DR_IS_READ (dr))
2392 dump_printf (MSG_NOTE, "load ");
2393 else
2394 dump_printf (MSG_NOTE, "store ");
2395 dump_printf (MSG_NOTE, "of size %u starting with ",
2396 (unsigned)groupsize);
2397 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2398 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2399 dump_printf_loc (MSG_NOTE, vect_location,
2400 "There is a gap of %u elements after the group\n",
2401 GROUP_GAP (vinfo_for_stmt (stmt)));
2404 /* SLP: create an SLP data structure for every interleaving group of
2405 stores for further analysis in vect_analyze_slp. */
2406 if (DR_IS_WRITE (dr) && !slp_impossible)
2408 if (loop_vinfo)
2409 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2410 if (bb_vinfo)
2411 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2415 return true;
2418 /* Analyze groups of accesses: check that DR belongs to a group of
2419 accesses of legal size, step, etc. Detect gaps, single element
2420 interleaving, and other special cases. Set grouped access info.
2421 Collect groups of strided stores for further use in SLP analysis. */
2423 static bool
2424 vect_analyze_group_access (struct data_reference *dr)
2426 if (!vect_analyze_group_access_1 (dr))
2428 /* Dissolve the group if present. */
2429 gimple *next;
2430 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2431 while (stmt)
2433 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2434 next = GROUP_NEXT_ELEMENT (vinfo);
2435 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2436 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2437 stmt = next;
2439 return false;
2441 return true;
2444 /* Analyze the access pattern of the data-reference DR.
2445 In case of non-consecutive accesses call vect_analyze_group_access() to
2446 analyze groups of accesses. */
2448 static bool
2449 vect_analyze_data_ref_access (struct data_reference *dr)
2451 tree step = DR_STEP (dr);
2452 tree scalar_type = TREE_TYPE (DR_REF (dr));
2453 gimple *stmt = DR_STMT (dr);
2454 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2455 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2456 struct loop *loop = NULL;
2458 if (loop_vinfo)
2459 loop = LOOP_VINFO_LOOP (loop_vinfo);
2461 if (loop_vinfo && !step)
2463 if (dump_enabled_p ())
2464 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2465 "bad data-ref access in loop\n");
2466 return false;
2469 /* Allow loads with zero step in inner-loop vectorization. */
2470 if (loop_vinfo && integer_zerop (step))
2472 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2473 if (!nested_in_vect_loop_p (loop, stmt))
2474 return DR_IS_READ (dr);
2475 /* Allow references with zero step for outer loops marked
2476 with pragma omp simd only - it guarantees absence of
2477 loop-carried dependencies between inner loop iterations. */
2478 if (!loop->force_vectorize)
2480 if (dump_enabled_p ())
2481 dump_printf_loc (MSG_NOTE, vect_location,
2482 "zero step in inner loop of nest\n");
2483 return false;
2487 if (loop && nested_in_vect_loop_p (loop, stmt))
2489 /* Interleaved accesses are not yet supported within outer-loop
2490 vectorization for references in the inner-loop. */
2491 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2493 /* For the rest of the analysis we use the outer-loop step. */
2494 step = STMT_VINFO_DR_STEP (stmt_info);
2495 if (integer_zerop (step))
2497 if (dump_enabled_p ())
2498 dump_printf_loc (MSG_NOTE, vect_location,
2499 "zero step in outer loop.\n");
2500 return DR_IS_READ (dr);
2504 /* Consecutive? */
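/* A consecutive access has a constant step whose absolute value equals the
   element size, e.g. a step of 4 bytes for 4-byte ints; a negative step of
   that magnitude is treated as consecutive as well.  */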
2505 if (TREE_CODE (step) == INTEGER_CST)
2507 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2508 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2509 || (dr_step < 0
2510 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2512 /* Mark that it is not interleaving. */
2513 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2514 return true;
2518 if (loop && nested_in_vect_loop_p (loop, stmt))
2520 if (dump_enabled_p ())
2521 dump_printf_loc (MSG_NOTE, vect_location,
2522 "grouped access in outer loop.\n");
2523 return false;
2527 /* Assume this is a DR handled by the non-constant strided load case. */
2528 if (TREE_CODE (step) != INTEGER_CST)
2529 return (STMT_VINFO_STRIDED_P (stmt_info)
2530 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2531 || vect_analyze_group_access (dr)));
2533 /* Not a consecutive access - check if it's part of an interleaving group. */
2534 return vect_analyze_group_access (dr);
2539 /* A helper function used in the comparator function to sort data
2540 references. T1 and T2 are two trees taken from the data references
2541 to be compared. The function returns -1, 0, or 1. */
2543 static int
2544 compare_tree (tree t1, tree t2)
2546 int i, cmp;
2547 enum tree_code code;
2548 char tclass;
2550 if (t1 == t2)
2551 return 0;
2552 if (t1 == NULL)
2553 return -1;
2554 if (t2 == NULL)
2555 return 1;
2557 STRIP_NOPS (t1);
2558 STRIP_NOPS (t2);
2560 if (TREE_CODE (t1) != TREE_CODE (t2))
2561 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2563 code = TREE_CODE (t1);
2564 switch (code)
2566 /* For const values, we can just use hash values for comparisons. */
2567 case INTEGER_CST:
2568 case REAL_CST:
2569 case FIXED_CST:
2570 case STRING_CST:
2571 case COMPLEX_CST:
2572 case VECTOR_CST:
2574 hashval_t h1 = iterative_hash_expr (t1, 0);
2575 hashval_t h2 = iterative_hash_expr (t2, 0);
2576 if (h1 != h2)
2577 return h1 < h2 ? -1 : 1;
2578 break;
2581 case SSA_NAME:
2582 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2583 if (cmp != 0)
2584 return cmp;
2586 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2587 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2588 break;
2590 default:
2591 tclass = TREE_CODE_CLASS (code);
2593 /* For var-decl, we could compare their UIDs. */
2594 if (tclass == tcc_declaration)
2596 if (DECL_UID (t1) != DECL_UID (t2))
2597 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2598 break;
2601 /* For expressions with operands, compare their operands recursively. */
2602 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2604 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2605 if (cmp != 0)
2606 return cmp;
2610 return 0;
2614 /* Compare two data-references DRA and DRB so that they can be sorted into
2615 chunks suitable for building interleaving groups. */
2617 static int
2618 dr_group_sort_cmp (const void *dra_, const void *drb_)
2620 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2621 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2622 int cmp;
2624 /* Stabilize sort. */
2625 if (dra == drb)
2626 return 0;
2628 /* DRs in different loops never belong to the same group. */
2629 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2630 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2631 if (loopa != loopb)
2632 return loopa->num < loopb->num ? -1 : 1;
2634 /* Ordering of DRs according to base. */
2635 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2637 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2638 if (cmp != 0)
2639 return cmp;
2642 /* And according to DR_OFFSET. */
2643 if (!dr_equal_offsets_p (dra, drb))
2645 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2646 if (cmp != 0)
2647 return cmp;
2650 /* Put reads before writes. */
2651 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2652 return DR_IS_READ (dra) ? -1 : 1;
2654 /* Then sort by access size. */
2655 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2656 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2658 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2659 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2660 if (cmp != 0)
2661 return cmp;
2664 /* And by step. */
2665 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2667 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2668 if (cmp != 0)
2669 return cmp;
2672 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2673 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2674 if (cmp == 0)
2675 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2676 return cmp;
2679 /* Function vect_analyze_data_ref_accesses.
2681 Analyze the access pattern of all the data references in the loop.
2683 FORNOW: the only access pattern that is considered vectorizable is a
2684 simple step 1 (consecutive) access.
2686 FORNOW: handle only arrays and pointer accesses. */
2688 bool
2689 vect_analyze_data_ref_accesses (vec_info *vinfo)
2691 unsigned int i;
2692 vec<data_reference_p> datarefs = vinfo->datarefs;
2693 struct data_reference *dr;
2695 if (dump_enabled_p ())
2696 dump_printf_loc (MSG_NOTE, vect_location,
2697 "=== vect_analyze_data_ref_accesses ===\n");
2699 if (datarefs.is_empty ())
2700 return true;
2702 /* Sort the array of datarefs to make building the interleaving chains
2703 linear. Don't modify the original vector's order; it is needed for
2704 determining what dependencies are reversed. */
2705 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2706 datarefs_copy.qsort (dr_group_sort_cmp);
2708 /* Build the interleaving chains. */
2709 for (i = 0; i < datarefs_copy.length () - 1;)
2711 data_reference_p dra = datarefs_copy[i];
2712 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2713 stmt_vec_info lastinfo = NULL;
2714 for (i = i + 1; i < datarefs_copy.length (); ++i)
2716 data_reference_p drb = datarefs_copy[i];
2717 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2719 /* ??? Imperfect sorting (non-compatible types, non-modulo
2720 accesses, same accesses) can lead to a group being artificially
2721 split here as we don't just skip over those. If it really
2722 matters we can push those to a worklist and re-iterate
2723 over them. Then we could just skip ahead to the next DR here. */
2725 /* DRs in different loops should not be put into the same
2726 interleaving group. */
2727 if (gimple_bb (DR_STMT (dra))->loop_father
2728 != gimple_bb (DR_STMT (drb))->loop_father)
2729 break;
2731 /* Check that the data-refs have the same first location (except init)
2732 and that they are both either stores or loads (not a load and a
2733 store, and not masked loads or stores). */
2734 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2735 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2736 DR_BASE_ADDRESS (drb), 0)
2737 || !dr_equal_offsets_p (dra, drb)
2738 || !gimple_assign_single_p (DR_STMT (dra))
2739 || !gimple_assign_single_p (DR_STMT (drb)))
2740 break;
2742 /* Check that the data-refs have the same constant size. */
2743 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2744 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2745 if (!tree_fits_uhwi_p (sza)
2746 || !tree_fits_uhwi_p (szb)
2747 || !tree_int_cst_equal (sza, szb))
2748 break;
2750 /* Check that the data-refs have the same step. */
2751 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2752 break;
2754 /* Do not place the same access in the interleaving chain twice. */
2755 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2756 break;
2758 /* Check the types are compatible.
2759 ??? We don't distinguish this during sorting. */
2760 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2761 TREE_TYPE (DR_REF (drb))))
2762 break;
2764 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2765 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2766 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2767 gcc_assert (init_a <= init_b);
2769 /* If init_b == init_a + the size of the type * k, we have an
2770 interleaving, and DRA is accessed before DRB. */
2771 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2772 if (type_size_a == 0
2773 || (init_b - init_a) % type_size_a != 0)
2774 break;
2776 /* If we have a store, the accesses are adjacent. This splits
2777 groups into chunks we support (we don't support vectorization
2778 of stores with gaps). */
2779 if (!DR_IS_READ (dra)
2780 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2781 (DR_INIT (datarefs_copy[i-1]))
2782 != type_size_a))
2783 break;
2785 /* If the step (when constant and non-zero) is greater than the
2786 difference between the data-refs' inits, they can remain in the same
2787 group; otherwise this check splits groups into suitable sizes. */
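/* For example, with 4-byte elements and a constant step of 8, members at
   inits 0 and 4 stay in one group, whereas a member at init 8 or beyond
   starts a new group.  */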
2788 if (tree_fits_shwi_p (DR_STEP (dra)))
2790 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2791 if (step != 0 && step <= (init_b - init_a))
2792 break;
2795 if (dump_enabled_p ())
2797 dump_printf_loc (MSG_NOTE, vect_location,
2798 "Detected interleaving ");
2799 if (DR_IS_READ (dra))
2800 dump_printf (MSG_NOTE, "load ");
2801 else
2802 dump_printf (MSG_NOTE, "store ");
2803 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2804 dump_printf (MSG_NOTE, " and ");
2805 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2806 dump_printf (MSG_NOTE, "\n");
2809 /* Link the found element into the group list. */
2810 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2812 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2813 lastinfo = stmtinfo_a;
2815 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2816 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2817 lastinfo = stmtinfo_b;
2821 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2822 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2823 && !vect_analyze_data_ref_access (dr))
2825 if (dump_enabled_p ())
2826 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2827 "not vectorized: complicated access pattern.\n");
2829 if (is_a <bb_vec_info> (vinfo))
2831 /* Mark the statement as not vectorizable. */
2832 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2833 continue;
2835 else
2837 datarefs_copy.release ();
2838 return false;
2842 datarefs_copy.release ();
2843 return true;
2847 /* Operator == between two dr_with_seg_len objects.
2849 This equality operator is used to make sure two data refs
2850 are the same so that we can consider combining the
2851 aliasing checks of two pairs of data-dependent data
2852 refs. */
2854 static bool
2855 operator == (const dr_with_seg_len& d1,
2856 const dr_with_seg_len& d2)
2858 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2859 DR_BASE_ADDRESS (d2.dr), 0)
2860 && compare_tree (DR_OFFSET (d1.dr), DR_OFFSET (d2.dr)) == 0
2861 && compare_tree (DR_INIT (d1.dr), DR_INIT (d2.dr)) == 0
2862 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2865 /* Function comp_dr_with_seg_len_pair.
2867 Comparison function for sorting objects of dr_with_seg_len_pair_t
2868 so that we can combine aliasing checks in one scan. */
2870 static int
2871 comp_dr_with_seg_len_pair (const void *pa_, const void *pb_)
2873 const dr_with_seg_len_pair_t* pa = (const dr_with_seg_len_pair_t *) pa_;
2874 const dr_with_seg_len_pair_t* pb = (const dr_with_seg_len_pair_t *) pb_;
2875 const dr_with_seg_len &a1 = pa->first, &a2 = pa->second;
2876 const dr_with_seg_len &b1 = pb->first, &b2 = pb->second;
2878 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2879 if a and c have the same base address and step, and b and d have the same
2880 address and step. Therefore, if either a&c or b&d don't have the same
2881 address and step, we don't care about the order of those two pairs after sorting. */
2882 int comp_res;
2884 if ((comp_res = compare_tree (DR_BASE_ADDRESS (a1.dr),
2885 DR_BASE_ADDRESS (b1.dr))) != 0)
2886 return comp_res;
2887 if ((comp_res = compare_tree (DR_BASE_ADDRESS (a2.dr),
2888 DR_BASE_ADDRESS (b2.dr))) != 0)
2889 return comp_res;
2890 if ((comp_res = compare_tree (DR_STEP (a1.dr), DR_STEP (b1.dr))) != 0)
2891 return comp_res;
2892 if ((comp_res = compare_tree (DR_STEP (a2.dr), DR_STEP (b2.dr))) != 0)
2893 return comp_res;
2894 if ((comp_res = compare_tree (DR_OFFSET (a1.dr), DR_OFFSET (b1.dr))) != 0)
2895 return comp_res;
2896 if ((comp_res = compare_tree (DR_INIT (a1.dr), DR_INIT (b1.dr))) != 0)
2897 return comp_res;
2898 if ((comp_res = compare_tree (DR_OFFSET (a2.dr), DR_OFFSET (b2.dr))) != 0)
2899 return comp_res;
2900 if ((comp_res = compare_tree (DR_INIT (a2.dr), DR_INIT (b2.dr))) != 0)
2901 return comp_res;
2903 return 0;
2906 /* Function vect_vfa_segment_size.
2908 Create an expression that computes the size of the segment
2909 that will be accessed for a data reference. The function takes into
2910 account that realignment loads may access one more vector.
2912 Input:
2913 DR: The data reference.
2914 LENGTH_FACTOR: segment length to consider.
2916 Return an expression whose value is the size of segment which will be
2917 accessed by DR. */
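/* For example, a data reference with a 4-byte step and a LENGTH_FACTOR of 4
   (the vectorization factor) gets a 16-byte segment; one vector size is
   added for dr_explicit_realign_optimized loads, since those may read one
   extra vector.  */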
2919 static tree
2920 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2922 tree segment_length;
2924 if (integer_zerop (DR_STEP (dr)))
2925 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2926 else
2927 segment_length = size_binop (MULT_EXPR,
2928 fold_convert (sizetype, DR_STEP (dr)),
2929 fold_convert (sizetype, length_factor));
2931 if (vect_supportable_dr_alignment (dr, false)
2932 == dr_explicit_realign_optimized)
2934 tree vector_size = TYPE_SIZE_UNIT
2935 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2937 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2939 return segment_length;
2942 /* Function vect_no_alias_p.
2944 Given data references A and B with equal base and offset, whose alias
2945 relation can be decided at compilation time, return TRUE if they do
2946 not alias each other; return FALSE otherwise. SEGMENT_LENGTH_A
2947 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
2948 respectively. */
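/* For example, with positive steps, A at init 0 with a 16-byte segment
   covers [0, 16) and B at init 16 with a 16-byte segment covers [16, 32);
   since the end of A's segment does not pass the start of B's, they cannot
   alias and TRUE is returned.  */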
2950 static bool
2951 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
2952 tree segment_length_a, tree segment_length_b)
2954 gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
2955 && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
2956 if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
2957 return false;
2959 tree seg_a_min = DR_INIT (a);
2960 tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
2961 seg_a_min, segment_length_a);
2962 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
2963 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
2964 [a, a+12) */
2965 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
2967 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
2968 seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
2969 seg_a_max, unit_size);
2970 seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
2971 DR_INIT (a), unit_size);
2973 tree seg_b_min = DR_INIT (b);
2974 tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
2975 seg_b_min, segment_length_b);
2976 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
2978 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
2979 seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
2980 seg_b_max, unit_size);
2981 seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
2982 DR_INIT (b), unit_size);
2985 if (tree_int_cst_le (seg_a_max, seg_b_min)
2986 || tree_int_cst_le (seg_b_max, seg_a_min))
2987 return true;
2989 return false;
2992 /* Function vect_prune_runtime_alias_test_list.
2994 Prune a list of ddrs to be tested at run-time by versioning for alias.
2995 Merge several alias checks into one if possible.
2996 Return FALSE if the resulting list of ddrs is longer than allowed by
2997 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2999 bool
3000 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3002 vec<ddr_p> may_alias_ddrs =
3003 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3004 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
3005 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3006 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3007 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3009 ddr_p ddr;
3010 unsigned int i;
3011 tree length_factor;
3013 if (dump_enabled_p ())
3014 dump_printf_loc (MSG_NOTE, vect_location,
3015 "=== vect_prune_runtime_alias_test_list ===\n");
3017 if (may_alias_ddrs.is_empty ())
3018 return true;
3020 /* Basically, for each pair of dependent data refs store_ptr_0
3021 and load_ptr_0, we create an expression:
3023 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
3024 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
3026 for aliasing checks. However, in some cases we can decrease
3027 the number of checks by combining two checks into one. For
3028 example, suppose we have another pair of data refs store_ptr_0
3029 and load_ptr_1, and if the following condition is satisfied:
3031 load_ptr_0 < load_ptr_1 &&
3032 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
3034 (this condition means that, in each iteration of the vectorized loop,
3035 the memory accessed through store_ptr_0 cannot lie between the memory
3036 of load_ptr_0 and load_ptr_1.)
3038 we can then use only the following expression to finish the
3039 aliasing checks between store_ptr_0 & load_ptr_0 and
3040 store_ptr_0 & load_ptr_1:
3042 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
3043 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
3045 Note that we only consider the case where load_ptr_0 and load_ptr_1
3046 have the same base address. */
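/* As an illustration of the merging done below: if two pairs share the
   same dr_b and their dr_a's access the same base at inits 0 and 4 with
   16-byte segments, DIFF is 4 and the merged check uses a single segment
   of MAX (16, 4 + 16) == 20 bytes, so one runtime test covers both
   pairs.  */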
3048 comp_alias_ddrs.create (may_alias_ddrs.length ());
3050 /* First, we collect all data ref pairs for aliasing checks. */
3051 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3053 int comp_res;
3054 struct data_reference *dr_a, *dr_b;
3055 gimple *dr_group_first_a, *dr_group_first_b;
3056 tree segment_length_a, segment_length_b;
3057 gimple *stmt_a, *stmt_b;
3059 dr_a = DDR_A (ddr);
3060 stmt_a = DR_STMT (DDR_A (ddr));
3061 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3062 if (dr_group_first_a)
3064 stmt_a = dr_group_first_a;
3065 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3068 dr_b = DDR_B (ddr);
3069 stmt_b = DR_STMT (DDR_B (ddr));
3070 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3071 if (dr_group_first_b)
3073 stmt_b = dr_group_first_b;
3074 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3077 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3078 length_factor = scalar_loop_iters;
3079 else
3080 length_factor = size_int (vect_factor);
3081 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3082 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3084 comp_res = compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b));
3085 if (comp_res == 0)
3086 comp_res = compare_tree (DR_OFFSET (dr_a), DR_OFFSET (dr_b));
3088 /* Alias is known at compilation time. */
3089 if (comp_res == 0
3090 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3091 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3092 && TREE_CODE (segment_length_a) == INTEGER_CST
3093 && TREE_CODE (segment_length_b) == INTEGER_CST)
3095 if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b))
3096 continue;
3098 if (dump_enabled_p ())
3099 dump_printf_loc (MSG_NOTE, vect_location,
3100 "not vectorized: compilation time alias.\n");
3102 return false;
3105 dr_with_seg_len_pair_t dr_with_seg_len_pair
3106 (dr_with_seg_len (dr_a, segment_length_a),
3107 dr_with_seg_len (dr_b, segment_length_b));
3109 /* Canonicalize pairs by sorting the two DR members. */
3110 if (comp_res > 0)
3111 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3113 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3116 /* Second, we sort the collected data ref pairs so that we can scan
3117 them once to combine all possible aliasing checks. */
3118 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
3120 /* Third, we scan the sorted dr pairs and check if we can combine
3121 alias checks of two neighboring dr pairs. */
3122 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
3124 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
3125 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
3126 *dr_b1 = &comp_alias_ddrs[i-1].second,
3127 *dr_a2 = &comp_alias_ddrs[i].first,
3128 *dr_b2 = &comp_alias_ddrs[i].second;
3130 /* Remove duplicate data ref pairs. */
3131 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
3133 if (dump_enabled_p ())
3135 dump_printf_loc (MSG_NOTE, vect_location,
3136 "found equal ranges ");
3137 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3138 DR_REF (dr_a1->dr));
3139 dump_printf (MSG_NOTE, ", ");
3140 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3141 DR_REF (dr_b1->dr));
3142 dump_printf (MSG_NOTE, " and ");
3143 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3144 DR_REF (dr_a2->dr));
3145 dump_printf (MSG_NOTE, ", ");
3146 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3147 DR_REF (dr_b2->dr));
3148 dump_printf (MSG_NOTE, "\n");
3151 comp_alias_ddrs.ordered_remove (i--);
3152 continue;
3155 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
3157 /* We consider the case where DR_B1 and DR_B2 are the same memref,
3158 and DR_A1 and DR_A2 are two consecutive memrefs. */
3159 if (*dr_a1 == *dr_a2)
3161 std::swap (dr_a1, dr_b1);
3162 std::swap (dr_a2, dr_b2);
3165 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
3166 DR_BASE_ADDRESS (dr_a2->dr), 0)
3167 || !operand_equal_p (DR_OFFSET (dr_a1->dr),
3168 DR_OFFSET (dr_a2->dr), 0)
3169 || !tree_fits_shwi_p (DR_INIT (dr_a1->dr))
3170 || !tree_fits_shwi_p (DR_INIT (dr_a2->dr)))
3171 continue;
3173 /* Make sure dr_a1 starts left of dr_a2. */
3174 if (tree_int_cst_lt (DR_INIT (dr_a2->dr), DR_INIT (dr_a1->dr)))
3175 std::swap (*dr_a1, *dr_a2);
3177 bool do_remove = false;
3178 unsigned HOST_WIDE_INT diff
3179 = (tree_to_shwi (DR_INIT (dr_a2->dr))
3180 - tree_to_shwi (DR_INIT (dr_a1->dr)));
3182 /* If the left segment does not extend beyond the start of the
3183 right segment the new segment length is that of the right
3184 plus the segment distance. */
3185 if (tree_fits_uhwi_p (dr_a1->seg_len)
3186 && compare_tree_int (dr_a1->seg_len, diff) <= 0)
3188 dr_a1->seg_len = size_binop (PLUS_EXPR, dr_a2->seg_len,
3189 size_int (diff));
3190 do_remove = true;
3192 /* Generally the new segment length is the maximum of the
3193 left segment size and the right segment size plus the distance.
3194 ??? We can also build tree MAX_EXPR here but it's not clear this
3195 is profitable. */
3196 else if (tree_fits_uhwi_p (dr_a1->seg_len)
3197 && tree_fits_uhwi_p (dr_a2->seg_len))
3199 unsigned HOST_WIDE_INT seg_len_a1 = tree_to_uhwi (dr_a1->seg_len);
3200 unsigned HOST_WIDE_INT seg_len_a2 = tree_to_uhwi (dr_a2->seg_len);
3201 dr_a1->seg_len = size_int (MAX (seg_len_a1, diff + seg_len_a2));
3202 do_remove = true;
3204 /* Now we check if the following condition is satisfied:
3206 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
3208 where DIFF = DR_A2_INIT - DR_A1_INIT. However,
3209 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
3210 have to make a best estimation. We can get the minimum value
3211 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
3212 then either of the following two conditions can guarantee the
3213 one above:
3215 1: DIFF <= MIN_SEG_LEN_B
3216 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B */
3217 else
3219 unsigned HOST_WIDE_INT min_seg_len_b
3220 = (tree_fits_uhwi_p (dr_b1->seg_len)
3221 ? tree_to_uhwi (dr_b1->seg_len)
3222 : vect_factor);
3224 if (diff <= min_seg_len_b
3225 || (tree_fits_uhwi_p (dr_a1->seg_len)
3226 && diff - tree_to_uhwi (dr_a1->seg_len) < min_seg_len_b))
3228 dr_a1->seg_len = size_binop (PLUS_EXPR,
3229 dr_a2->seg_len, size_int (diff));
3230 do_remove = true;
3234 if (do_remove)
3236 if (dump_enabled_p ())
3238 dump_printf_loc (MSG_NOTE, vect_location,
3239 "merging ranges for ");
3240 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1->dr));
3241 dump_printf (MSG_NOTE, ", ");
3242 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1->dr));
3243 dump_printf (MSG_NOTE, " and ");
3244 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2->dr));
3245 dump_printf (MSG_NOTE, ", ");
3246 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2->dr));
3247 dump_printf (MSG_NOTE, "\n");
3249 comp_alias_ddrs.ordered_remove (i--);
3254 dump_printf_loc (MSG_NOTE, vect_location,
3255 "improved number of alias checks from %d to %d\n",
3256 may_alias_ddrs.length (), comp_alias_ddrs.length ());
3257 if ((int) comp_alias_ddrs.length () >
3258 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3260 if (dump_enabled_p ())
3261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3262 "number of versioning for alias "
3263 "run-time tests exceeds %d "
3264 "(--param vect-max-version-for-alias-checks)\n",
3265 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3266 return false;
3269 /* All alias checks have been resolved at compilation time. */
3270 if (!comp_alias_ddrs.length ())
3271 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0);
3273 return true;
3276 /* Return true if a non-affine read or write in STMT is suitable for a
3277 gather load or scatter store. Describe the operation in *INFO if so. */
3279 bool
3280 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3281 gather_scatter_info *info)
3283 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3284 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3285 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3286 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3287 tree offtype = NULL_TREE;
3288 tree decl, base, off;
3289 machine_mode pmode;
3290 int punsignedp, reversep, pvolatilep = 0;
3292 base = DR_REF (dr);
3293 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3294 see if we can use the def stmt of the address. */
3295 if (is_gimple_call (stmt)
3296 && gimple_call_internal_p (stmt)
3297 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3298 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3299 && TREE_CODE (base) == MEM_REF
3300 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3301 && integer_zerop (TREE_OPERAND (base, 1))
3302 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3304 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3305 if (is_gimple_assign (def_stmt)
3306 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3307 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3310 /* The gather and scatter builtins need an address of the form
3311 loop_invariant + vector * {1, 2, 4, 8} or
3313 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3314 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3315 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3316 multiplications and additions in it. To get a vector, we need
3317 a single SSA_NAME that will be defined in the loop and will
3318 contain everything that is not loop invariant and that can be
3319 vectorized. The following code attempts to find such a preexisting
3320 SSA_NAME OFF and put the loop invariants into a tree BASE
3321 that can be gimplified before the loop. */
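/* As a rough illustration, an address computed as p + ((sizetype) i * 8
   + 16) with loop-invariant p and i defined in the loop would be
   decomposed by the code below into BASE == p + 16, OFF == i and
   SCALE == 8, matching the builtin form above.  */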
3322 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3323 &punsignedp, &reversep, &pvolatilep);
3324 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3326 if (TREE_CODE (base) == MEM_REF)
3328 if (!integer_zerop (TREE_OPERAND (base, 1)))
3330 if (off == NULL_TREE)
3332 offset_int moff = mem_ref_offset (base);
3333 off = wide_int_to_tree (sizetype, moff);
3335 else
3336 off = size_binop (PLUS_EXPR, off,
3337 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3339 base = TREE_OPERAND (base, 0);
3341 else
3342 base = build_fold_addr_expr (base);
3344 if (off == NULL_TREE)
3345 off = size_zero_node;
3347 /* If base is not loop invariant, then if off is 0 we start with just
3348 the constant offset in the loop invariant BASE and continue with base
3349 as OFF; otherwise give up.
3350 We could handle that case by gimplifying the addition of base + off
3351 into some SSA_NAME and using that as off, but for now punt. */
3352 if (!expr_invariant_in_loop_p (loop, base))
3354 if (!integer_zerop (off))
3355 return false;
3356 off = base;
3357 base = size_int (pbitpos / BITS_PER_UNIT);
3359 /* Otherwise put base + constant offset into the loop invariant BASE
3360 and continue with OFF. */
3361 else
3363 base = fold_convert (sizetype, base);
3364 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3367 /* OFF at this point may be either a SSA_NAME or some tree expression
3368 from get_inner_reference. Try to peel off loop invariants from it
3369 into BASE as long as possible. */
3370 STRIP_NOPS (off);
3371 while (offtype == NULL_TREE)
3373 enum tree_code code;
3374 tree op0, op1, add = NULL_TREE;
3376 if (TREE_CODE (off) == SSA_NAME)
3378 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3380 if (expr_invariant_in_loop_p (loop, off))
3381 return false;
3383 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3384 break;
3386 op0 = gimple_assign_rhs1 (def_stmt);
3387 code = gimple_assign_rhs_code (def_stmt);
3388 op1 = gimple_assign_rhs2 (def_stmt);
3390 else
3392 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3393 return false;
3394 code = TREE_CODE (off);
3395 extract_ops_from_tree (off, &code, &op0, &op1);
3397 switch (code)
3399 case POINTER_PLUS_EXPR:
3400 case PLUS_EXPR:
3401 if (expr_invariant_in_loop_p (loop, op0))
3403 add = op0;
3404 off = op1;
3405 do_add:
3406 add = fold_convert (sizetype, add);
3407 if (scale != 1)
3408 add = size_binop (MULT_EXPR, add, size_int (scale));
3409 base = size_binop (PLUS_EXPR, base, add);
3410 continue;
3412 if (expr_invariant_in_loop_p (loop, op1))
3414 add = op1;
3415 off = op0;
3416 goto do_add;
3418 break;
3419 case MINUS_EXPR:
3420 if (expr_invariant_in_loop_p (loop, op1))
3422 add = fold_convert (sizetype, op1);
3423 add = size_binop (MINUS_EXPR, size_zero_node, add);
3424 off = op0;
3425 goto do_add;
3427 break;
3428 case MULT_EXPR:
3429 if (scale == 1 && tree_fits_shwi_p (op1))
3431 scale = tree_to_shwi (op1);
3432 off = op0;
3433 continue;
3435 break;
3436 case SSA_NAME:
3437 off = op0;
3438 continue;
3439 CASE_CONVERT:
3440 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3441 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3442 break;
3443 if (TYPE_PRECISION (TREE_TYPE (op0))
3444 == TYPE_PRECISION (TREE_TYPE (off)))
3446 off = op0;
3447 continue;
3449 if (TYPE_PRECISION (TREE_TYPE (op0))
3450 < TYPE_PRECISION (TREE_TYPE (off)))
3452 off = op0;
3453 offtype = TREE_TYPE (off);
3454 STRIP_NOPS (off);
3455 continue;
3457 break;
3458 default:
3459 break;
3461 break;
3464 /* If at the end OFF still isn't a SSA_NAME or isn't
3465 defined in the loop, punt. */
3466 if (TREE_CODE (off) != SSA_NAME
3467 || expr_invariant_in_loop_p (loop, off))
3468 return false;
3470 if (offtype == NULL_TREE)
3471 offtype = TREE_TYPE (off);
3473 if (DR_IS_READ (dr))
3474 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3475 offtype, scale);
3476 else
3477 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3478 offtype, scale);
3480 if (decl == NULL_TREE)
3481 return false;
3483 info->decl = decl;
3484 info->base = base;
3485 info->offset = off;
3486 info->offset_dt = vect_unknown_def_type;
3487 info->offset_vectype = NULL_TREE;
3488 info->scale = scale;
3489 return true;
3492 /* Function vect_analyze_data_refs.
3494 Find all the data references in the loop or basic block.
3496 The general structure of the analysis of data refs in the vectorizer is as
3497 follows:
3498 1- vect_analyze_data_refs(loop/bb): call
3499 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3500 in the loop/bb and their dependences.
3501 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3502 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3503 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3507 bool
3508 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3510 struct loop *loop = NULL;
3511 unsigned int i;
3512 struct data_reference *dr;
3513 tree scalar_type;
3515 if (dump_enabled_p ())
3516 dump_printf_loc (MSG_NOTE, vect_location,
3517 "=== vect_analyze_data_refs ===\n");
3519 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3520 loop = LOOP_VINFO_LOOP (loop_vinfo);
3522 /* Go through the data-refs, check that the analysis succeeded. Update
3523 pointer from stmt_vec_info struct to DR and vectype. */
3525 vec<data_reference_p> datarefs = vinfo->datarefs;
3526 FOR_EACH_VEC_ELT (datarefs, i, dr)
3528 gimple *stmt;
3529 stmt_vec_info stmt_info;
3530 tree base, offset, init;
3531 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3532 bool simd_lane_access = false;
3533 int vf;
3535 again:
3536 if (!dr || !DR_REF (dr))
3538 if (dump_enabled_p ())
3539 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3540 "not vectorized: unhandled data-ref\n");
3541 return false;
3544 stmt = DR_STMT (dr);
3545 stmt_info = vinfo_for_stmt (stmt);
3547 /* Discard clobbers from the dataref vector. We will remove
3548 clobber stmts during vectorization. */
3549 if (gimple_clobber_p (stmt))
3551 free_data_ref (dr);
3552 if (i == datarefs.length () - 1)
3554 datarefs.pop ();
3555 break;
3557 datarefs.ordered_remove (i);
3558 dr = datarefs[i];
3559 goto again;
3562 /* Check that analysis of the data-ref succeeded. */
3563 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3564 || !DR_STEP (dr))
3566 bool maybe_gather
3567 = DR_IS_READ (dr)
3568 && !TREE_THIS_VOLATILE (DR_REF (dr))
3569 && targetm.vectorize.builtin_gather != NULL;
3570 bool maybe_scatter
3571 = DR_IS_WRITE (dr)
3572 && !TREE_THIS_VOLATILE (DR_REF (dr))
3573 && targetm.vectorize.builtin_scatter != NULL;
3574 bool maybe_simd_lane_access
3575 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3577 /* If target supports vector gather loads or scatter stores, or if
3578 this might be a SIMD lane access, see if they can be used. */
3579 if (is_a <loop_vec_info> (vinfo)
3580 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3581 && !nested_in_vect_loop_p (loop, stmt))
3583 struct data_reference *newdr
3584 = create_data_ref (NULL, loop_containing_stmt (stmt),
3585 DR_REF (dr), stmt, maybe_scatter ? false : true);
3586 gcc_assert (newdr != NULL && DR_REF (newdr));
3587 if (DR_BASE_ADDRESS (newdr)
3588 && DR_OFFSET (newdr)
3589 && DR_INIT (newdr)
3590 && DR_STEP (newdr)
3591 && integer_zerop (DR_STEP (newdr)))
3593 if (maybe_simd_lane_access)
3595 tree off = DR_OFFSET (newdr);
3596 STRIP_NOPS (off);
3597 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3598 && TREE_CODE (off) == MULT_EXPR
3599 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3601 tree step = TREE_OPERAND (off, 1);
3602 off = TREE_OPERAND (off, 0);
3603 STRIP_NOPS (off);
3604 if (CONVERT_EXPR_P (off)
3605 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3606 0)))
3607 < TYPE_PRECISION (TREE_TYPE (off)))
3608 off = TREE_OPERAND (off, 0);
3609 if (TREE_CODE (off) == SSA_NAME)
3611 gimple *def = SSA_NAME_DEF_STMT (off);
3612 tree reft = TREE_TYPE (DR_REF (newdr));
3613 if (is_gimple_call (def)
3614 && gimple_call_internal_p (def)
3615 && (gimple_call_internal_fn (def)
3616 == IFN_GOMP_SIMD_LANE))
3618 tree arg = gimple_call_arg (def, 0);
3619 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3620 arg = SSA_NAME_VAR (arg);
3621 if (arg == loop->simduid
3622 /* For now. */
3623 && tree_int_cst_equal
3624 (TYPE_SIZE_UNIT (reft),
3625 step))
3627 DR_OFFSET (newdr) = ssize_int (0);
3628 DR_STEP (newdr) = step;
3629 DR_ALIGNED_TO (newdr)
3630 = size_int (BIGGEST_ALIGNMENT);
3631 dr = newdr;
3632 simd_lane_access = true;
3638 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3640 dr = newdr;
3641 if (maybe_gather)
3642 gatherscatter = GATHER;
3643 else
3644 gatherscatter = SCATTER;
3647 if (gatherscatter == SG_NONE && !simd_lane_access)
3648 free_data_ref (newdr);
3651 if (gatherscatter == SG_NONE && !simd_lane_access)
3653 if (dump_enabled_p ())
3655 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3656 "not vectorized: data ref analysis "
3657 "failed ");
3658 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3661 if (is_a <bb_vec_info> (vinfo))
3662 break;
3664 return false;
3668 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3670 if (dump_enabled_p ())
3671 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3672 "not vectorized: base addr of dr is a "
3673 "constant\n");
3675 if (is_a <bb_vec_info> (vinfo))
3676 break;
3678 if (gatherscatter != SG_NONE || simd_lane_access)
3679 free_data_ref (dr);
3680 return false;
3683 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3685 if (dump_enabled_p ())
3687 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3688 "not vectorized: volatile type ");
3689 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3692 if (is_a <bb_vec_info> (vinfo))
3693 break;
3695 return false;
3698 if (stmt_can_throw_internal (stmt))
3700 if (dump_enabled_p ())
3702 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3703 "not vectorized: statement can throw an "
3704 "exception ");
3705 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3708 if (is_a <bb_vec_info> (vinfo))
3709 break;
3711 if (gatherscatter != SG_NONE || simd_lane_access)
3712 free_data_ref (dr);
3713 return false;
3716 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3717 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3719 if (dump_enabled_p ())
3721 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3722 "not vectorized: statement is bitfield "
3723 "access ");
3724 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3727 if (is_a <bb_vec_info> (vinfo))
3728 break;
3730 if (gatherscatter != SG_NONE || simd_lane_access)
3731 free_data_ref (dr);
3732 return false;
3735 base = unshare_expr (DR_BASE_ADDRESS (dr));
3736 offset = unshare_expr (DR_OFFSET (dr));
3737 init = unshare_expr (DR_INIT (dr));
3739 if (is_gimple_call (stmt)
3740 && (!gimple_call_internal_p (stmt)
3741 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3742 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3744 if (dump_enabled_p ())
3746 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3747 "not vectorized: dr in a call ");
3748 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3751 if (is_a <bb_vec_info> (vinfo))
3752 break;
3754 if (gatherscatter != SG_NONE || simd_lane_access)
3755 free_data_ref (dr);
3756 return false;
3759 /* Update DR field in stmt_vec_info struct. */
3761 /* If the dataref is in an inner-loop of the loop that is considered
3762 for vectorization, we also want to analyze the access relative to
3763 the outer-loop (DR contains information only relative to the
3764 inner-most enclosing loop). We do that by building a reference to the
3765 first location accessed by the inner-loop, and analyze it relative to
3766 the outer-loop. */
3767 if (loop && nested_in_vect_loop_p (loop, stmt))
3769 tree outer_step, outer_base, outer_init;
3770 HOST_WIDE_INT pbitsize, pbitpos;
3771 tree poffset;
3772 machine_mode pmode;
3773 int punsignedp, preversep, pvolatilep;
3774 affine_iv base_iv, offset_iv;
3775 tree dinit;
3777 /* Build a reference to the first location accessed by the
3778 inner-loop: *(BASE+INIT). (The first location is actually
3779 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3780 tree inner_base = build_fold_indirect_ref
3781 (fold_build_pointer_plus (base, init));
3783 if (dump_enabled_p ())
3785 dump_printf_loc (MSG_NOTE, vect_location,
3786 "analyze in outer-loop: ");
3787 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3788 dump_printf (MSG_NOTE, "\n");
3791 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3792 &poffset, &pmode, &punsignedp,
3793 &preversep, &pvolatilep);
3794 gcc_assert (outer_base != NULL_TREE);
3796 if (pbitpos % BITS_PER_UNIT != 0)
3798 if (dump_enabled_p ())
3799 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3800 "failed: bit offset alignment.\n");
3801 return false;
3804 if (preversep)
3806 if (dump_enabled_p ())
3807 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3808 "failed: reverse storage order.\n");
3809 return false;
3812 outer_base = build_fold_addr_expr (outer_base);
3813 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3814 &base_iv, false))
3816 if (dump_enabled_p ())
3817 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3818 "failed: evolution of base is not affine.\n");
3819 return false;
3822 if (offset)
3824 if (poffset)
3825 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3826 poffset);
3827 else
3828 poffset = offset;
3831 if (!poffset)
3833 offset_iv.base = ssize_int (0);
3834 offset_iv.step = ssize_int (0);
3836 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3837 &offset_iv, false))
3839 if (dump_enabled_p ())
3840 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3841 "evolution of offset is not affine.\n");
3842 return false;
3845 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3846 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3847 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3848 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3849 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3851 outer_step = size_binop (PLUS_EXPR,
3852 fold_convert (ssizetype, base_iv.step),
3853 fold_convert (ssizetype, offset_iv.step));
3855 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3856 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3857 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3858 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3859 STMT_VINFO_DR_OFFSET (stmt_info) =
3860 fold_convert (ssizetype, offset_iv.base);
3861 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3862 size_int (highest_pow2_factor (offset_iv.base));
3864 if (dump_enabled_p ())
3866 dump_printf_loc (MSG_NOTE, vect_location,
3867 "\touter base_address: ");
3868 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3869 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3870 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3871 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3872 STMT_VINFO_DR_OFFSET (stmt_info));
3873 dump_printf (MSG_NOTE,
3874 "\n\touter constant offset from base address: ");
3875 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3876 STMT_VINFO_DR_INIT (stmt_info));
3877 dump_printf (MSG_NOTE, "\n\touter step: ");
3878 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3879 STMT_VINFO_DR_STEP (stmt_info));
3880 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3881 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3882 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3883 dump_printf (MSG_NOTE, "\n");
3887 if (STMT_VINFO_DATA_REF (stmt_info))
3889 if (dump_enabled_p ())
3891 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3892 "not vectorized: more than one data ref "
3893 "in stmt: ");
3894 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3897 if (is_a <bb_vec_info> (vinfo))
3898 break;
3900 if (gatherscatter != SG_NONE || simd_lane_access)
3901 free_data_ref (dr);
3902 return false;
3905 STMT_VINFO_DATA_REF (stmt_info) = dr;
3906 if (simd_lane_access)
3908 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3909 free_data_ref (datarefs[i]);
3910 datarefs[i] = dr;
3913 /* Set vectype for STMT. */
3914 scalar_type = TREE_TYPE (DR_REF (dr));
3915 STMT_VINFO_VECTYPE (stmt_info)
3916 = get_vectype_for_scalar_type (scalar_type);
3917 if (!STMT_VINFO_VECTYPE (stmt_info))
3919 if (dump_enabled_p ())
3921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3922 "not vectorized: no vectype for stmt: ");
3923 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3924 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3925 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3926 scalar_type);
3927 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3930 if (is_a <bb_vec_info> (vinfo))
3932 /* No vector type is fine, the ref can still participate
3933 in dependence analysis, we just can't vectorize it. */
3934 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3935 continue;
3938 if (gatherscatter != SG_NONE || simd_lane_access)
3940 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3941 if (gatherscatter != SG_NONE)
3942 free_data_ref (dr);
3944 return false;
3946 else
3948 if (dump_enabled_p ())
3950 dump_printf_loc (MSG_NOTE, vect_location,
3951 "got vectype for stmt: ");
3952 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3953 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3954 STMT_VINFO_VECTYPE (stmt_info));
3955 dump_printf (MSG_NOTE, "\n");
3959 /* Adjust the minimal vectorization factor according to the
3960 vector type. */
3961 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3962 if (vf > *min_vf)
3963 *min_vf = vf;
3965 if (gatherscatter != SG_NONE)
3967 gather_scatter_info gs_info;
3968 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3969 &gs_info)
3970 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3972 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3973 free_data_ref (dr);
3974 if (dump_enabled_p ())
3976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3977 (gatherscatter == GATHER) ?
3978 "not vectorized: not suitable for gather "
3979 "load " :
3980 "not vectorized: not suitable for scatter "
3981 "store ");
3982 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3984 return false;
3987 free_data_ref (datarefs[i]);
3988 datarefs[i] = dr;
3989 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3992 else if (is_a <loop_vec_info> (vinfo)
3993 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3995 if (nested_in_vect_loop_p (loop, stmt))
3997 if (dump_enabled_p ())
3999 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4000 "not vectorized: not suitable for strided "
4001 "load ");
4002 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4004 return false;
4006 STMT_VINFO_STRIDED_P (stmt_info) = true;
4010 /* If we stopped analysis at the first dataref we could not analyze
4011 when trying to vectorize a basic-block, mark the rest of the datarefs
4012 as not vectorizable and truncate the vector of datarefs. That
4013 avoids spending useless time in analyzing their dependence. */
4014 if (i != datarefs.length ())
4016 gcc_assert (is_a <bb_vec_info> (vinfo));
4017 for (unsigned j = i; j < datarefs.length (); ++j)
4019 data_reference_p dr = datarefs[j];
4020 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
4021 free_data_ref (dr);
4023 datarefs.truncate (i);
4026 return true;
4030 /* Function vect_get_new_vect_var.
4032 Returns a name for a new variable. The current naming scheme uses a
4033 prefix ("vect", "vectp", "stmp" or "mask", depending on the value of
4034 VAR_KIND) for vectorizer-generated variables, and appends "_" and NAME
4035 to it if NAME is provided. */
4037 tree
4038 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4040 const char *prefix;
4041 tree new_vect_var;
4043 switch (var_kind)
4045 case vect_simple_var:
4046 prefix = "vect";
4047 break;
4048 case vect_scalar_var:
4049 prefix = "stmp";
4050 break;
4051 case vect_mask_var:
4052 prefix = "mask";
4053 break;
4054 case vect_pointer_var:
4055 prefix = "vectp";
4056 break;
4057 default:
4058 gcc_unreachable ();
4061 if (name)
4063 char* tmp = concat (prefix, "_", name, NULL);
4064 new_vect_var = create_tmp_reg (type, tmp);
4065 free (tmp);
4067 else
4068 new_vect_var = create_tmp_reg (type, prefix);
4070 return new_vect_var;
4073 /* Like vect_get_new_vect_var but return an SSA name. */
4075 tree
4076 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4078 const char *prefix;
4079 tree new_vect_var;
4081 switch (var_kind)
4083 case vect_simple_var:
4084 prefix = "vect";
4085 break;
4086 case vect_scalar_var:
4087 prefix = "stmp";
4088 break;
4089 case vect_pointer_var:
4090 prefix = "vectp";
4091 break;
4092 default:
4093 gcc_unreachable ();
4096 if (name)
4098 char* tmp = concat (prefix, "_", name, NULL);
4099 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4100 free (tmp);
4102 else
4103 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4105 return new_vect_var;
4108 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4110 static void
4111 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
4112 stmt_vec_info stmt_info)
4114 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4115 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
4116 int misalign = DR_MISALIGNMENT (dr);
4117 if (misalign == -1)
4118 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4119 else
4120 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
4123 /* Function vect_create_addr_base_for_vector_ref.
4125 Create an expression that computes the address of the first memory location
4126 that will be accessed for a data reference.
4128 Input:
4129 STMT: The statement containing the data reference.
4130 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4131 OFFSET: Optional. If supplied, it is added to the initial address.
4132 LOOP: Specify relative to which loop-nest should the address be computed.
4133 For example, when the dataref is in an inner-loop nested in an
4134 outer-loop that is now being vectorized, LOOP can be either the
4135 outer-loop, or the inner-loop. The first memory location accessed
4136 by the following dataref ('in' points to short):
4138 for (i=0; i<N; i++)
4139 for (j=0; j<M; j++)
4140 s += in[i+j]
4142 is as follows:
4143 if LOOP=i_loop: &in (relative to i_loop)
4144 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4145 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4146 initial address. Unlike OFFSET, which is the number of elements to
4147 be added, BYTE_OFFSET is measured in bytes.
4149 Output:
4150 1. Return an SSA_NAME whose value is the address of the memory location of
4151 the first vector of the data reference.
4152 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4153 these statement(s) which define the returned SSA_NAME.
4155 FORNOW: We are only handling array accesses with step 1. */
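/* A rough scalar model of the address arithmetic this function builds:
   BASE + (DR_OFFSET + DR_INIT + OFFSET * element_size + BYTE_OFFSET).
   OFFSET counts elements and so is scaled by the element size, while
   INIT and BYTE_OFFSET are already byte quantities.  All names below are
   placeholders; the real code emits trees and gimple, not pointer
   arithmetic.  */
#if 0
#include <stddef.h>

static char *
addr_base_sketch (char *base, ptrdiff_t dr_offset, ptrdiff_t dr_init,
		  ptrdiff_t offset, ptrdiff_t byte_offset, size_t elem_size)
{
  ptrdiff_t base_offset = dr_offset + dr_init;

  base_offset += offset * (ptrdiff_t) elem_size;  /* OFFSET is in elements.  */
  base_offset += byte_offset;			  /* BYTE_OFFSET is in bytes.  */
  return base + base_offset;
}
#endif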
4157 tree
4158 vect_create_addr_base_for_vector_ref (gimple *stmt,
4159 gimple_seq *new_stmt_list,
4160 tree offset,
4161 struct loop *loop,
4162 tree byte_offset)
4164 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4165 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4166 tree data_ref_base;
4167 const char *base_name;
4168 tree addr_base;
4169 tree dest;
4170 gimple_seq seq = NULL;
4171 tree base_offset;
4172 tree init;
4173 tree vect_ptr_type;
4174 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4175 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4177 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
4179 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
4181 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
4183 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4184 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
4185 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
4187 else
4189 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
4190 base_offset = unshare_expr (DR_OFFSET (dr));
4191 init = unshare_expr (DR_INIT (dr));
4194 if (loop_vinfo)
4195 base_name = get_name (data_ref_base);
4196 else
4198 base_offset = ssize_int (0);
4199 init = ssize_int (0);
4200 base_name = get_name (DR_REF (dr));
4203 /* Create base_offset */
4204 base_offset = size_binop (PLUS_EXPR,
4205 fold_convert (sizetype, base_offset),
4206 fold_convert (sizetype, init));
4208 if (offset)
4210 offset = fold_build2 (MULT_EXPR, sizetype,
4211 fold_convert (sizetype, offset), step);
4212 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4213 base_offset, offset);
4215 if (byte_offset)
4217 byte_offset = fold_convert (sizetype, byte_offset);
4218 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4219 base_offset, byte_offset);
4222 /* base + base_offset */
4223 if (loop_vinfo)
4224 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4225 else
4227 addr_base = build1 (ADDR_EXPR,
4228 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4229 unshare_expr (DR_REF (dr)));
4232 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4233 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4234 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4235 gimple_seq_add_seq (new_stmt_list, seq);
4237 if (DR_PTR_INFO (dr)
4238 && TREE_CODE (addr_base) == SSA_NAME
4239 && !SSA_NAME_PTR_INFO (addr_base))
4241 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4242 if (offset || byte_offset)
4243 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4246 if (dump_enabled_p ())
4248 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4249 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4250 dump_printf (MSG_NOTE, "\n");
4253 return addr_base;
4257 /* Function vect_create_data_ref_ptr.
4259 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4260 location accessed in the loop by STMT, along with the def-use update
4261 chain to appropriately advance the pointer through the loop iterations.
4262 Also set aliasing information for the pointer. This pointer is used by
4263 the callers to this function to create a memory reference expression for
4264 vector load/store access.
4266 Input:
4267 1. STMT: a stmt that references memory. Expected to be of the form
4268 GIMPLE_ASSIGN <name, data-ref> or
4269 GIMPLE_ASSIGN <data-ref, name>.
4270 2. AGGR_TYPE: the type of the reference, which should be either a vector
4271 or an array.
4272 3. AT_LOOP: the loop where the vector memref is to be created.
4273 4. OFFSET (optional): an offset to be added to the initial address accessed
4274 by the data-ref in STMT.
4275 5. BSI: location where the new stmts are to be placed if there is no loop
4276 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4277 pointing to the initial address.
4278 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4279 to the initial address accessed by the data-ref in STMT. This is
4280 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4281 in bytes.
4283 Output:
4284 1. Declare a new ptr to vector_type, and have it point to the base of the
4285 data reference (initial address accessed by the data reference).
4286 For example, for vector of type V8HI, the following code is generated:
4288 v8hi *ap;
4289 ap = (v8hi *)initial_address;
4291 if OFFSET is not supplied:
4292 initial_address = &a[init];
4293 if OFFSET is supplied:
4294 initial_address = &a[init + OFFSET];
4295 if BYTE_OFFSET is supplied:
4296 initial_address = &a[init] + BYTE_OFFSET;
4298 Return the initial_address in INITIAL_ADDRESS.
4300 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4301 update the pointer in each iteration of the loop.
4303 Return the increment stmt that updates the pointer in PTR_INCR.
4305 3. Set INV_P to true if the access pattern of the data reference in the
4306 vectorized loop is invariant. Set it to false otherwise.
4308 4. Return the pointer. */
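/* A scalar sketch of the pointer and its update chain for a V8HI-style
   access over short A[].  The typedef and function names are placeholders;
   aliasing, alignment, ONLY_INIT and the nested-loop case are ignored.  */
#if 0
typedef short sketch_v8hi[8];	/* stands in for the V8HI vector type  */

static void
data_ref_ptr_sketch (short *a, int init, int nvecs)
{
  /* (2) The initial address: ap = (v8hi *) &a[init];  */
  sketch_v8hi *ap = (sketch_v8hi *) &a[init];

  for (int i = 0; i < nvecs; i++)
    {
      /* ... the vector load or store through *ap goes here ...  */

      /* (3) Bump the pointer by the aggregate type size each iteration,
	 which is what the created IV does.  */
      ap = ap + 1;
    }
}
#endif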
4310 tree
4311 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4312 tree offset, tree *initial_address,
4313 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4314 bool only_init, bool *inv_p, tree byte_offset)
4316 const char *base_name;
4317 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4318 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4319 struct loop *loop = NULL;
4320 bool nested_in_vect_loop = false;
4321 struct loop *containing_loop = NULL;
4322 tree aggr_ptr_type;
4323 tree aggr_ptr;
4324 tree new_temp;
4325 gimple_seq new_stmt_list = NULL;
4326 edge pe = NULL;
4327 basic_block new_bb;
4328 tree aggr_ptr_init;
4329 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4330 tree aptr;
4331 gimple_stmt_iterator incr_gsi;
4332 bool insert_after;
4333 tree indx_before_incr, indx_after_incr;
4334 gimple *incr;
4335 tree step;
4336 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4338 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4339 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4341 if (loop_vinfo)
4343 loop = LOOP_VINFO_LOOP (loop_vinfo);
4344 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4345 containing_loop = (gimple_bb (stmt))->loop_father;
4346 pe = loop_preheader_edge (loop);
4348 else
4350 gcc_assert (bb_vinfo);
4351 only_init = true;
4352 *ptr_incr = NULL;
4355 /* Check the step (evolution) of the load in LOOP, and record
4356 whether it's invariant. */
4357 if (nested_in_vect_loop)
4358 step = STMT_VINFO_DR_STEP (stmt_info);
4359 else
4360 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4362 if (integer_zerop (step))
4363 *inv_p = true;
4364 else
4365 *inv_p = false;
4367 /* Create an expression for the first address accessed by this load
4368 in LOOP. */
4369 base_name = get_name (DR_BASE_ADDRESS (dr));
4371 if (dump_enabled_p ())
4373 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4374 dump_printf_loc (MSG_NOTE, vect_location,
4375 "create %s-pointer variable to type: ",
4376 get_tree_code_name (TREE_CODE (aggr_type)));
4377 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4378 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4379 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4380 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4381 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4382 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4383 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4384 else
4385 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4386 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4387 dump_printf (MSG_NOTE, "\n");
4390 /* (1) Create the new aggregate-pointer variable.
4391 Vector and array types inherit the alias set of their component
4392 type by default so we need to use a ref-all pointer if the data
4393 reference does not conflict with the created aggregated data
4394 reference because it is not addressable. */
4395 bool need_ref_all = false;
4396 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4397 get_alias_set (DR_REF (dr))))
4398 need_ref_all = true;
4399 /* Likewise for any of the data references in the stmt group. */
4400 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4402 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4405 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4406 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4407 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4408 get_alias_set (DR_REF (sdr))))
4410 need_ref_all = true;
4411 break;
4413 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4415 while (orig_stmt);
4417 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4418 need_ref_all);
4419 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4422 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4423 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4424 def-use update cycles for the pointer: one relative to the outer-loop
4425 (LOOP), which is what steps (3) and (4) below do. The other is relative
4426 to the inner-loop (which is the inner-most loop containing the dataref),
4427 and this is done by step (5) below.
4429 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4430 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4431 redundant. Steps (3),(4) create the following:
4433 vp0 = &base_addr;
4434 LOOP: vp1 = phi(vp0,vp2)
4437 vp2 = vp1 + step
4438 goto LOOP
4440 If there is an inner-loop nested in loop, then step (5) will also be
4441 applied, and an additional update in the inner-loop will be created:
4443 vp0 = &base_addr;
4444 LOOP: vp1 = phi(vp0,vp2)
4446 inner: vp3 = phi(vp1,vp4)
4447 vp4 = vp3 + inner_step
4448 if () goto inner
4450 vp2 = vp1 + step
4451 if () goto LOOP */
4453 /* (2) Calculate the initial address of the aggregate-pointer, and set
4454 the aggregate-pointer to point to it before the loop. */
4456 /* Create &(base[init_val+offset]) + byte_offset in the loop preheader. */
4458 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4459 offset, loop, byte_offset);
4460 if (new_stmt_list)
4462 if (pe)
4464 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4465 gcc_assert (!new_bb);
4467 else
4468 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4471 *initial_address = new_temp;
4472 aggr_ptr_init = new_temp;
4474 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4475 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4476 inner-loop nested in LOOP (during outer-loop vectorization). */
4478 /* No update in loop is required. */
4479 if (only_init && (!loop_vinfo || at_loop == loop))
4480 aptr = aggr_ptr_init;
4481 else
4483 /* The step of the aggregate pointer is the type size. */
4484 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4485 /* One exception to the above is when the scalar step of the load in
4486 LOOP is zero. In this case the step here is also zero. */
4487 if (*inv_p)
4488 iv_step = size_zero_node;
4489 else if (tree_int_cst_sgn (step) == -1)
4490 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4492 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4494 create_iv (aggr_ptr_init,
4495 fold_convert (aggr_ptr_type, iv_step),
4496 aggr_ptr, loop, &incr_gsi, insert_after,
4497 &indx_before_incr, &indx_after_incr);
4498 incr = gsi_stmt (incr_gsi);
4499 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4501 /* Copy the points-to information if it exists. */
4502 if (DR_PTR_INFO (dr))
4504 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4505 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4507 if (ptr_incr)
4508 *ptr_incr = incr;
4510 aptr = indx_before_incr;
4513 if (!nested_in_vect_loop || only_init)
4514 return aptr;
4517 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4518 nested in LOOP, if it exists. */
4520 gcc_assert (nested_in_vect_loop);
4521 if (!only_init)
4523 standard_iv_increment_position (containing_loop, &incr_gsi,
4524 &insert_after);
4525 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4526 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4527 &indx_after_incr);
4528 incr = gsi_stmt (incr_gsi);
4529 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4531 /* Copy the points-to information if it exists. */
4532 if (DR_PTR_INFO (dr))
4534 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4535 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4537 if (ptr_incr)
4538 *ptr_incr = incr;
4540 return indx_before_incr;
4542 else
4543 gcc_unreachable ();
4547 /* Function bump_vector_ptr
4549 Increment a pointer (to a vector type) by vector-size. If requested,
4550 i.e. if PTR_INCR is given, then also connect the new increment stmt
4551 to the existing def-use update-chain of the pointer, by modifying
4552 the PTR_INCR as illustrated below:
4554 The pointer def-use update-chain before this function:
4555 DATAREF_PTR = phi (p_0, p_2)
4556 ....
4557 PTR_INCR: p_2 = DATAREF_PTR + step
4559 The pointer def-use update-chain after this function:
4560 DATAREF_PTR = phi (p_0, p_2)
4561 ....
4562 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4563 ....
4564 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4566 Input:
4567 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4568 in the loop.
4569 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4570 the loop. The increment amount across iterations is expected
4571 to be vector_size.
4572 BSI - location where the new update stmt is to be placed.
4573 STMT - the original scalar memory-access stmt that is being vectorized.
4574 BUMP - optional. The offset by which to bump the pointer. If not given,
4575 the offset is assumed to be vector_size.
4577 Output: Return NEW_DATAREF_PTR as illustrated above.
4581 tree
4582 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4583 gimple *stmt, tree bump)
4585 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4586 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4587 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4588 tree update = TYPE_SIZE_UNIT (vectype);
4589 gassign *incr_stmt;
4590 ssa_op_iter iter;
4591 use_operand_p use_p;
4592 tree new_dataref_ptr;
4594 if (bump)
4595 update = bump;
4597 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4598 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4599 else
4600 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4601 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4602 dataref_ptr, update);
4603 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4605 /* Copy the points-to information if it exists. */
4606 if (DR_PTR_INFO (dr))
4608 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4609 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4612 if (!ptr_incr)
4613 return new_dataref_ptr;
4615 /* Update the vector-pointer's cross-iteration increment. */
4616 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4618 tree use = USE_FROM_PTR (use_p);
4620 if (use == dataref_ptr)
4621 SET_USE (use_p, new_dataref_ptr);
4622 else
4623 gcc_assert (tree_int_cst_compare (use, update) == 0);
4626 return new_dataref_ptr;
4630 /* Function vect_create_destination_var.
4632 Create a new temporary of type VECTYPE. */
4634 tree
4635 vect_create_destination_var (tree scalar_dest, tree vectype)
4637 tree vec_dest;
4638 const char *name;
4639 char *new_name;
4640 tree type;
4641 enum vect_var_kind kind;
4643 kind = vectype
4644 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4645 ? vect_mask_var
4646 : vect_simple_var
4647 : vect_scalar_var;
4648 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4650 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4652 name = get_name (scalar_dest);
4653 if (name)
4654 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4655 else
4656 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4657 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4658 free (new_name);
4660 return vec_dest;
4663 /* Function vect_grouped_store_supported.
4665 Returns TRUE if interleave high and interleave low permutations
4666 are supported, and FALSE otherwise. */
4668 bool
4669 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4671 machine_mode mode = TYPE_MODE (vectype);
4673 /* vect_permute_store_chain requires the group size to be equal to 3 or
4674 be a power of two. */
4675 if (count != 3 && exact_log2 (count) == -1)
4677 if (dump_enabled_p ())
4678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4679 "the size of the group of accesses"
4680 " is not a power of 2 or not eqaul to 3\n");
4681 return false;
4684 /* Check that the permutation is supported. */
4685 if (VECTOR_MODE_P (mode))
4687 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4688 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4690 if (count == 3)
4692 unsigned int j0 = 0, j1 = 0, j2 = 0;
4693 unsigned int i, j;
4695 for (j = 0; j < 3; j++)
4697 int nelt0 = ((3 - j) * nelt) % 3;
4698 int nelt1 = ((3 - j) * nelt + 1) % 3;
4699 int nelt2 = ((3 - j) * nelt + 2) % 3;
4700 for (i = 0; i < nelt; i++)
4702 if (3 * i + nelt0 < nelt)
4703 sel[3 * i + nelt0] = j0++;
4704 if (3 * i + nelt1 < nelt)
4705 sel[3 * i + nelt1] = nelt + j1++;
4706 if (3 * i + nelt2 < nelt)
4707 sel[3 * i + nelt2] = 0;
4709 if (!can_vec_perm_p (mode, false, sel))
4711 if (dump_enabled_p ())
4712 dump_printf (MSG_MISSED_OPTIMIZATION,
4713 "permutaion op not supported by target.\n");
4714 return false;
4717 for (i = 0; i < nelt; i++)
4719 if (3 * i + nelt0 < nelt)
4720 sel[3 * i + nelt0] = 3 * i + nelt0;
4721 if (3 * i + nelt1 < nelt)
4722 sel[3 * i + nelt1] = 3 * i + nelt1;
4723 if (3 * i + nelt2 < nelt)
4724 sel[3 * i + nelt2] = nelt + j2++;
4726 if (!can_vec_perm_p (mode, false, sel))
4728 if (dump_enabled_p ())
4729 dump_printf (MSG_MISSED_OPTIMIZATION,
4730 "permutaion op not supported by target.\n");
4731 return false;
4734 return true;
4736 else
4738 /* If length is not equal to 3 then only a power of 2 is supported. */
4739 gcc_assert (exact_log2 (count) != -1);
4741 for (i = 0; i < nelt / 2; i++)
4743 sel[i * 2] = i;
4744 sel[i * 2 + 1] = i + nelt;
4746 if (can_vec_perm_p (mode, false, sel))
4748 for (i = 0; i < nelt; i++)
4749 sel[i] += nelt / 2;
4750 if (can_vec_perm_p (mode, false, sel))
4751 return true;
4756 if (dump_enabled_p ())
4757 dump_printf (MSG_MISSED_OPTIMIZATION,
4758 "permutaion op not supported by target.\n");
4759 return false;
4763 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4764 type VECTYPE. */
4766 bool
4767 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4769 return vect_lanes_optab_supported_p ("vec_store_lanes",
4770 vec_store_lanes_optab,
4771 vectype, count);
4775 /* Function vect_permute_store_chain.
4777 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4778 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4779 the data correctly for the stores. Return the final references for stores
4780 in RESULT_CHAIN.
4782 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4783 The input is 4 vectors each containing 8 elements. We assign a number to
4784 each element, the input sequence is:
4786 1st vec: 0 1 2 3 4 5 6 7
4787 2nd vec: 8 9 10 11 12 13 14 15
4788 3rd vec: 16 17 18 19 20 21 22 23
4789 4th vec: 24 25 26 27 28 29 30 31
4791 The output sequence should be:
4793 1st vec: 0 8 16 24 1 9 17 25
4794 2nd vec: 2 10 18 26 3 11 19 27
4795 3rd vec: 4 12 20 28 5 13 21 29
4796 4th vec: 6 14 22 30 7 15 23 31
4798 i.e., we interleave the contents of the four vectors in their order.
4800 We use interleave_high/low instructions to create such output. The input of
4801 each interleave_high/low operation is two vectors:
4802 1st vec 2nd vec
4803 0 1 2 3 4 5 6 7
4804 the even elements of the result vector are obtained left-to-right from the
4805 high/low elements of the first vector. The odd elements of the result are
4806 obtained left-to-right from the high/low elements of the second vector.
4807 The output of interleave_high will be: 0 4 1 5
4808 and of interleave_low: 2 6 3 7
4811 The permutation is done in log LENGTH stages. In each stage interleave_high
4812 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4813 where the first argument is taken from the first half of DR_CHAIN and the
4814 second argument from its second half.
4815 In our example,
4817 I1: interleave_high (1st vec, 3rd vec)
4818 I2: interleave_low (1st vec, 3rd vec)
4819 I3: interleave_high (2nd vec, 4th vec)
4820 I4: interleave_low (2nd vec, 4th vec)
4822 The output for the first stage is:
4824 I1: 0 16 1 17 2 18 3 19
4825 I2: 4 20 5 21 6 22 7 23
4826 I3: 8 24 9 25 10 26 11 27
4827 I4: 12 28 13 29 14 30 15 31
4829 The output of the second stage, i.e. the final result is:
4831 I1: 0 8 16 24 1 9 17 25
4832 I2: 2 10 18 26 3 11 19 27
4833 I3: 4 12 20 28 5 13 21 29
4834 I4: 6 14 22 30 7 15 23 31. */
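/* A self-contained scalar model of how the interleave_high/low selectors
   used by this function are built and applied, with NELT == 4 so that it
   reproduces the "0 4 1 5" / "2 6 3 7" example above.  All names are
   placeholders; the real code emits VEC_PERM_EXPRs using masks from
   vect_gen_perm_mask_checked.  */
#if 0
#include <stdio.h>

#define NELT 4

int
main (void)
{
  int vec1[NELT] = { 0, 1, 2, 3 };
  int vec2[NELT] = { 4, 5, 6, 7 };
  unsigned char sel_high[NELT], sel_low[NELT];
  int i;

  /* sel_high = { 0, NELT, 1, NELT + 1, ... }  */
  for (i = 0; i < NELT / 2; i++)
    {
      sel_high[i * 2] = i;
      sel_high[i * 2 + 1] = i + NELT;
    }
  /* sel_low is sel_high with every index shifted by NELT / 2.  */
  for (i = 0; i < NELT; i++)
    sel_low[i] = sel_high[i] + NELT / 2;

  /* Apply a selector the way VEC_PERM_EXPR does: indices below NELT pick
     from VEC1, the rest pick from VEC2.  Prints "0 4 1 5" and "2 6 3 7".  */
  for (i = 0; i < NELT; i++)
    printf ("%d ", sel_high[i] < NELT
		   ? vec1[sel_high[i]] : vec2[sel_high[i] - NELT]);
  printf ("  (interleave_high)\n");
  for (i = 0; i < NELT; i++)
    printf ("%d ", sel_low[i] < NELT
		   ? vec1[sel_low[i]] : vec2[sel_low[i] - NELT]);
  printf ("  (interleave_low)\n");
  return 0;
}
#endif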
4836 void
4837 vect_permute_store_chain (vec<tree> dr_chain,
4838 unsigned int length,
4839 gimple *stmt,
4840 gimple_stmt_iterator *gsi,
4841 vec<tree> *result_chain)
4843 tree vect1, vect2, high, low;
4844 gimple *perm_stmt;
4845 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4846 tree perm_mask_low, perm_mask_high;
4847 tree data_ref;
4848 tree perm3_mask_low, perm3_mask_high;
4849 unsigned int i, n, log_length = exact_log2 (length);
4850 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4851 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4853 result_chain->quick_grow (length);
4854 memcpy (result_chain->address (), dr_chain.address (),
4855 length * sizeof (tree));
4857 if (length == 3)
4859 unsigned int j0 = 0, j1 = 0, j2 = 0;
4861 for (j = 0; j < 3; j++)
4863 int nelt0 = ((3 - j) * nelt) % 3;
4864 int nelt1 = ((3 - j) * nelt + 1) % 3;
4865 int nelt2 = ((3 - j) * nelt + 2) % 3;
4867 for (i = 0; i < nelt; i++)
4869 if (3 * i + nelt0 < nelt)
4870 sel[3 * i + nelt0] = j0++;
4871 if (3 * i + nelt1 < nelt)
4872 sel[3 * i + nelt1] = nelt + j1++;
4873 if (3 * i + nelt2 < nelt)
4874 sel[3 * i + nelt2] = 0;
4876 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4878 for (i = 0; i < nelt; i++)
4880 if (3 * i + nelt0 < nelt)
4881 sel[3 * i + nelt0] = 3 * i + nelt0;
4882 if (3 * i + nelt1 < nelt)
4883 sel[3 * i + nelt1] = 3 * i + nelt1;
4884 if (3 * i + nelt2 < nelt)
4885 sel[3 * i + nelt2] = nelt + j2++;
4887 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4889 vect1 = dr_chain[0];
4890 vect2 = dr_chain[1];
4892 /* Create interleaving stmt:
4893 low = VEC_PERM_EXPR <vect1, vect2,
4894 {j, nelt, *, j + 1, nelt + j + 1, *,
4895 j + 2, nelt + j + 2, *, ...}> */
4896 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4897 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4898 vect2, perm3_mask_low);
4899 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4901 vect1 = data_ref;
4902 vect2 = dr_chain[2];
4903 /* Create interleaving stmt:
4904 low = VEC_PERM_EXPR <vect1, vect2,
4905 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4906 6, 7, nelt + j + 2, ...}> */
4907 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4908 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4909 vect2, perm3_mask_high);
4910 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4911 (*result_chain)[j] = data_ref;
4914 else
4916 /* If length is not equal to 3 then only a power of 2 is supported. */
4917 gcc_assert (exact_log2 (length) != -1);
4919 for (i = 0, n = nelt / 2; i < n; i++)
4921 sel[i * 2] = i;
4922 sel[i * 2 + 1] = i + nelt;
4924 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4926 for (i = 0; i < nelt; i++)
4927 sel[i] += nelt / 2;
4928 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4930 for (i = 0, n = log_length; i < n; i++)
4932 for (j = 0; j < length/2; j++)
4934 vect1 = dr_chain[j];
4935 vect2 = dr_chain[j+length/2];
4937 /* Create interleaving stmt:
4938 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4939 ...}> */
4940 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4941 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4942 vect2, perm_mask_high);
4943 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4944 (*result_chain)[2*j] = high;
4946 /* Create interleaving stmt:
4947 low = VEC_PERM_EXPR <vect1, vect2,
4948 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4949 ...}> */
4950 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4951 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4952 vect2, perm_mask_low);
4953 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4954 (*result_chain)[2*j+1] = low;
4956 memcpy (dr_chain.address (), result_chain->address (),
4957 length * sizeof (tree));
4962 /* Function vect_setup_realignment
4964 This function is called when vectorizing an unaligned load using
4965 the dr_explicit_realign[_optimized] scheme.
4966 This function generates the following code at the loop prolog:
4968 p = initial_addr;
4969 x msq_init = *(floor(p)); # prolog load
4970 realignment_token = call target_builtin;
4971 loop:
4972 x msq = phi (msq_init, ---)
4974 The stmts marked with x are generated only for the case of
4975 dr_explicit_realign_optimized.
4977 The code above sets up a new (vector) pointer, pointing to the first
4978 location accessed by STMT, and a "floor-aligned" load using that pointer.
4979 It also generates code to compute the "realignment-token" (if the relevant
4980 target hook was defined), and creates a phi-node at the loop-header bb
4981 whose arguments are the result of the prolog-load (created by this
4982 function) and the result of a load that takes place in the loop (to be
4983 created by the caller to this function).
4985 For the case of dr_explicit_realign_optimized:
4986 The caller to this function uses the phi-result (msq) to create the
4987 realignment code inside the loop, and sets up the missing phi argument,
4988 as follows:
4989 loop:
4990 msq = phi (msq_init, lsq)
4991 lsq = *(floor(p')); # load in loop
4992 result = realign_load (msq, lsq, realignment_token);
4994 For the case of dr_explicit_realign:
4995 loop:
4996 msq = *(floor(p)); # load in loop
4997 p' = p + (VS-1);
4998 lsq = *(floor(p')); # load in loop
4999 result = realign_load (msq, lsq, realignment_token);
5001 Input:
5002 STMT - (scalar) load stmt to be vectorized. This load accesses
5003 a memory location that may be unaligned.
5004 BSI - place where new code is to be inserted.
5005 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5006 is used.
5008 Output:
5009 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5010 target hook, if defined.
5011 Return value - the result of the loop-header phi node. */
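/* A scalar sketch of the dr_explicit_realign idea for 16-byte vectors of
   bytes: two aligned loads plus a merge replace one unaligned load.  The
   merge loop stands in for the target's REALIGN_LOAD/mask_for_load pair
   and ignores endianness details; all names are placeholders.  */
#if 0
#include <stdint.h>
#include <string.h>

#define VS 16

static void
realign_load_sketch (const uint8_t *p, uint8_t *result)
{
  const uint8_t *floor_p
    = (const uint8_t *) ((uintptr_t) p & -(uintptr_t) VS);
  const uint8_t *floor_p2
    = (const uint8_t *) ((uintptr_t) (p + VS - 1) & -(uintptr_t) VS);
  unsigned int shift = (uintptr_t) p & (VS - 1);
  uint8_t msq[VS], lsq[VS];

  memcpy (msq, floor_p, VS);	/* msq = *(floor (p));          */
  memcpy (lsq, floor_p2, VS);	/* lsq = *(floor (p + VS - 1)); */

  /* result = realign_load (msq, lsq, token): the tail of MSQ followed by
     the head of LSQ.  */
  for (unsigned int i = 0; i < VS; i++)
    result[i] = i + shift < VS ? msq[i + shift] : lsq[i + shift - VS];
}
#endif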
5013 tree
5014 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5015 tree *realignment_token,
5016 enum dr_alignment_support alignment_support_scheme,
5017 tree init_addr,
5018 struct loop **at_loop)
5020 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5021 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5022 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5023 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5024 struct loop *loop = NULL;
5025 edge pe = NULL;
5026 tree scalar_dest = gimple_assign_lhs (stmt);
5027 tree vec_dest;
5028 gimple *inc;
5029 tree ptr;
5030 tree data_ref;
5031 basic_block new_bb;
5032 tree msq_init = NULL_TREE;
5033 tree new_temp;
5034 gphi *phi_stmt;
5035 tree msq = NULL_TREE;
5036 gimple_seq stmts = NULL;
5037 bool inv_p;
5038 bool compute_in_loop = false;
5039 bool nested_in_vect_loop = false;
5040 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5041 struct loop *loop_for_initial_load = NULL;
5043 if (loop_vinfo)
5045 loop = LOOP_VINFO_LOOP (loop_vinfo);
5046 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5049 gcc_assert (alignment_support_scheme == dr_explicit_realign
5050 || alignment_support_scheme == dr_explicit_realign_optimized);
5052 /* We need to generate three things:
5053 1. the misalignment computation
5054 2. the extra vector load (for the optimized realignment scheme).
5055 3. the phi node for the two vectors from which the realignment is
5056 done (for the optimized realignment scheme). */
5058 /* 1. Determine where to generate the misalignment computation.
5060 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5061 calculation will be generated by this function, outside the loop (in the
5062 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5063 caller, inside the loop.
5065 Background: If the misalignment remains fixed throughout the iterations of
5066 the loop, then both realignment schemes are applicable, and also the
5067 misalignment computation can be done outside LOOP. This is because we are
5068 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5069 are a multiple of VS (the Vector Size), and therefore the misalignment in
5070 different vectorized LOOP iterations is always the same.
5071 The problem arises only if the memory access is in an inner-loop nested
5072 inside LOOP, which is now being vectorized using outer-loop vectorization.
5073 This is the only case when the misalignment of the memory access may not
5074 remain fixed throughout the iterations of the inner-loop (as explained in
5075 detail in vect_supportable_dr_alignment). In this case, not only is the
5076 optimized realignment scheme not applicable, but also the misalignment
5077 computation (and generation of the realignment token that is passed to
5078 REALIGN_LOAD) have to be done inside the loop.
5080 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5081 or not, which in turn determines if the misalignment is computed inside
5082 the inner-loop, or outside LOOP. */
5084 if (init_addr != NULL_TREE || !loop_vinfo)
5086 compute_in_loop = true;
5087 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5091 /* 2. Determine where to generate the extra vector load.
5093 For the optimized realignment scheme, instead of generating two vector
5094 loads in each iteration, we generate a single extra vector load in the
5095 preheader of the loop, and in each iteration reuse the result of the
5096 vector load from the previous iteration. In case the memory access is in
5097 an inner-loop nested inside LOOP, which is now being vectorized using
5098 outer-loop vectorization, we need to determine whether this initial vector
5099 load should be generated at the preheader of the inner-loop, or can be
5100 generated at the preheader of LOOP. If the memory access has no evolution
5101 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5102 to be generated inside LOOP (in the preheader of the inner-loop). */
5104 if (nested_in_vect_loop)
5106 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5107 bool invariant_in_outerloop =
5108 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5109 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5111 else
5112 loop_for_initial_load = loop;
5113 if (at_loop)
5114 *at_loop = loop_for_initial_load;
5116 if (loop_for_initial_load)
5117 pe = loop_preheader_edge (loop_for_initial_load);
5119 /* 3. For the case of the optimized realignment, create the first vector
5120 load at the loop preheader. */
5122 if (alignment_support_scheme == dr_explicit_realign_optimized)
5124 /* Create msq_init = *(floor(p1)) in the loop preheader */
5125 gassign *new_stmt;
5127 gcc_assert (!compute_in_loop);
5128 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5129 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5130 NULL_TREE, &init_addr, NULL, &inc,
5131 true, &inv_p);
5132 if (TREE_CODE (ptr) == SSA_NAME)
5133 new_temp = copy_ssa_name (ptr);
5134 else
5135 new_temp = make_ssa_name (TREE_TYPE (ptr));
5136 new_stmt = gimple_build_assign
5137 (new_temp, BIT_AND_EXPR, ptr,
5138 build_int_cst (TREE_TYPE (ptr),
5139 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
5140 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5141 gcc_assert (!new_bb);
5142 data_ref
5143 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5144 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5145 new_stmt = gimple_build_assign (vec_dest, data_ref);
5146 new_temp = make_ssa_name (vec_dest, new_stmt);
5147 gimple_assign_set_lhs (new_stmt, new_temp);
5148 if (pe)
5150 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5151 gcc_assert (!new_bb);
5153 else
5154 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5156 msq_init = gimple_assign_lhs (new_stmt);
5159 /* 4. Create realignment token using a target builtin, if available.
5160 It is done either inside the containing loop, or before LOOP (as
5161 determined above). */
5163 if (targetm.vectorize.builtin_mask_for_load)
5165 gcall *new_stmt;
5166 tree builtin_decl;
5168 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5169 if (!init_addr)
5171 /* Generate the INIT_ADDR computation outside LOOP. */
5172 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5173 NULL_TREE, loop);
5174 if (loop)
5176 pe = loop_preheader_edge (loop);
5177 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5178 gcc_assert (!new_bb);
5180 else
5181 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5184 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5185 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5186 vec_dest =
5187 vect_create_destination_var (scalar_dest,
5188 gimple_call_return_type (new_stmt));
5189 new_temp = make_ssa_name (vec_dest, new_stmt);
5190 gimple_call_set_lhs (new_stmt, new_temp);
5192 if (compute_in_loop)
5193 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5194 else
5196 /* Generate the misalignment computation outside LOOP. */
5197 pe = loop_preheader_edge (loop);
5198 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5199 gcc_assert (!new_bb);
5202 *realignment_token = gimple_call_lhs (new_stmt);
5204 /* The result of the CALL_EXPR to this builtin is determined from
5205 the value of the parameter and no global variables are touched
5206 which makes the builtin a "const" function. Requiring the
5207 builtin to have the "const" attribute makes it unnecessary
5208 to call mark_call_clobbered. */
5209 gcc_assert (TREE_READONLY (builtin_decl));
5212 if (alignment_support_scheme == dr_explicit_realign)
5213 return msq;
5215 gcc_assert (!compute_in_loop);
5216 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5219 /* 5. Create msq = phi <msq_init, lsq> in loop */
5221 pe = loop_preheader_edge (containing_loop);
5222 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5223 msq = make_ssa_name (vec_dest);
5224 phi_stmt = create_phi_node (msq, containing_loop->header);
5225 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5227 return msq;
5231 /* Function vect_grouped_load_supported.
5233 COUNT is the size of the load group (the number of statements plus the
5234 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5235 only one statement, with a gap of COUNT - 1.
5237 Returns true if a suitable permute exists. */
5239 bool
5240 vect_grouped_load_supported (tree vectype, bool single_element_p,
5241 unsigned HOST_WIDE_INT count)
5243 machine_mode mode = TYPE_MODE (vectype);
5245 /* If this is single-element interleaving with an element distance
5246 that leaves unused vector loads around, punt - we at least create
5247 very sub-optimal code in that case (and blow up memory,
5248 see PR65518). */
5249 if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
5251 if (dump_enabled_p ())
5252 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5253 "single-element interleaving not supported "
5254 "for not adjacent vector loads\n");
5255 return false;
5258 /* vect_permute_load_chain requires the group size to be equal to 3 or
5259 be a power of two. */
5260 if (count != 3 && exact_log2 (count) == -1)
5262 if (dump_enabled_p ())
5263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5264 "the size of the group of accesses"
5265 " is not a power of 2 or not equal to 3\n");
5266 return false;
5269 /* Check that the permutation is supported. */
5270 if (VECTOR_MODE_P (mode))
5272 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5273 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5275 if (count == 3)
5277 unsigned int k;
5278 for (k = 0; k < 3; k++)
5280 for (i = 0; i < nelt; i++)
5281 if (3 * i + k < 2 * nelt)
5282 sel[i] = 3 * i + k;
5283 else
5284 sel[i] = 0;
5285 if (!can_vec_perm_p (mode, false, sel))
5287 if (dump_enabled_p ())
5288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5289 "shuffle of 3 loads is not supported by"
5290 " target\n");
5291 return false;
5293 for (i = 0, j = 0; i < nelt; i++)
5294 if (3 * i + k < 2 * nelt)
5295 sel[i] = i;
5296 else
5297 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5298 if (!can_vec_perm_p (mode, false, sel))
5300 if (dump_enabled_p ())
5301 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5302 "shuffle of 3 loads is not supported by"
5303 " target\n");
5304 return false;
5307 return true;
5309 else
5311 /* If length is not equal to 3 then only a power of 2 is supported. */
5312 gcc_assert (exact_log2 (count) != -1);
5313 for (i = 0; i < nelt; i++)
5314 sel[i] = i * 2;
5315 if (can_vec_perm_p (mode, false, sel))
5317 for (i = 0; i < nelt; i++)
5318 sel[i] = i * 2 + 1;
5319 if (can_vec_perm_p (mode, false, sel))
5320 return true;
5325 if (dump_enabled_p ())
5326 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5327 "extract even/odd not supported by target\n");
5328 return false;
5331 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5332 type VECTYPE. */
5334 bool
5335 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5337 return vect_lanes_optab_supported_p ("vec_load_lanes",
5338 vec_load_lanes_optab,
5339 vectype, count);
5342 /* Function vect_permute_load_chain.
5344 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5345 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5346 the input data correctly. Return the final references for loads in
5347 RESULT_CHAIN.
5349 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5350 The input is 4 vectors each containing 8 elements. We assign a number to each
5351 element, the input sequence is:
5353 1st vec: 0 1 2 3 4 5 6 7
5354 2nd vec: 8 9 10 11 12 13 14 15
5355 3rd vec: 16 17 18 19 20 21 22 23
5356 4th vec: 24 25 26 27 28 29 30 31
5358 The output sequence should be:
5360 1st vec: 0 4 8 12 16 20 24 28
5361 2nd vec: 1 5 9 13 17 21 25 29
5362 3rd vec: 2 6 10 14 18 22 26 30
5363 4th vec: 3 7 11 15 19 23 27 31
5365 i.e., the first output vector should contain the first elements of each
5366 interleaving group, etc.
5368 We use extract_even/odd instructions to create such output. The input of
5369 each extract_even/odd operation is two vectors
5370 1st vec 2nd vec
5371 0 1 2 3 4 5 6 7
5373 and the output is the vector of extracted even/odd elements. The output of
5374 extract_even will be: 0 2 4 6
5375 and of extract_odd: 1 3 5 7
5378 The permutation is done in log LENGTH stages. In each stage extract_even
5379 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5380 their order. In our example,
5382 E1: extract_even (1st vec, 2nd vec)
5383 E2: extract_odd (1st vec, 2nd vec)
5384 E3: extract_even (3rd vec, 4th vec)
5385 E4: extract_odd (3rd vec, 4th vec)
5387 The output for the first stage will be:
5389 E1: 0 2 4 6 8 10 12 14
5390 E2: 1 3 5 7 9 11 13 15
5391 E3: 16 18 20 22 24 26 28 30
5392 E4: 17 19 21 23 25 27 29 31
5394 In order to proceed and create the correct sequence for the next stage (or
5395 for the correct output, if the second stage is the last one, as in our
5396 example), we first put the output of extract_even operation and then the
5397 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5398 The input for the second stage is:
5400 1st vec (E1): 0 2 4 6 8 10 12 14
5401 2nd vec (E3): 16 18 20 22 24 26 28 30
5402 3rd vec (E2): 1 3 5 7 9 11 13 15
5403 4th vec (E4): 17 19 21 23 25 27 29 31
5405 The output of the second stage:
5407 E1: 0 4 8 12 16 20 24 28
5408 E2: 2 6 10 14 18 22 26 30
5409 E3: 1 5 9 13 17 21 25 29
5410 E4: 3 7 11 15 19 23 27 31
5412 And RESULT_CHAIN after reordering:
5414 1st vec (E1): 0 4 8 12 16 20 24 28
5415 2nd vec (E3): 1 5 9 13 17 21 25 29
5416 3rd vec (E2): 2 6 10 14 18 22 26 30
5417 4th vec (E4): 3 7 11 15 19 23 27 31. */
5419 static void
5420 vect_permute_load_chain (vec<tree> dr_chain,
5421 unsigned int length,
5422 gimple *stmt,
5423 gimple_stmt_iterator *gsi,
5424 vec<tree> *result_chain)
5426 tree data_ref, first_vect, second_vect;
5427 tree perm_mask_even, perm_mask_odd;
5428 tree perm3_mask_low, perm3_mask_high;
5429 gimple *perm_stmt;
5430 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5431 unsigned int i, j, log_length = exact_log2 (length);
5432 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5433 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5435 result_chain->quick_grow (length);
5436 memcpy (result_chain->address (), dr_chain.address (),
5437 length * sizeof (tree));
5439 if (length == 3)
5441 unsigned int k;
5443 for (k = 0; k < 3; k++)
5445 for (i = 0; i < nelt; i++)
5446 if (3 * i + k < 2 * nelt)
5447 sel[i] = 3 * i + k;
5448 else
5449 sel[i] = 0;
5450 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5452 for (i = 0, j = 0; i < nelt; i++)
5453 if (3 * i + k < 2 * nelt)
5454 sel[i] = i;
5455 else
5456 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5458 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5460 first_vect = dr_chain[0];
5461 second_vect = dr_chain[1];
5463 /* Create interleaving stmt (low part of):
5464 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5465 ...}> */
5466 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5467 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5468 second_vect, perm3_mask_low);
5469 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5471 /* Create interleaving stmt (high part of):
5472 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5473 ...}> */
5474 first_vect = data_ref;
5475 second_vect = dr_chain[2];
5476 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5477 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5478 second_vect, perm3_mask_high);
5479 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5480 (*result_chain)[k] = data_ref;
5483 else
5485 /* If LENGTH is not equal to 3 then only a power of 2 is supported. */
5486 gcc_assert (exact_log2 (length) != -1);
5488 for (i = 0; i < nelt; ++i)
5489 sel[i] = i * 2;
5490 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5492 for (i = 0; i < nelt; ++i)
5493 sel[i] = i * 2 + 1;
5494 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5496 for (i = 0; i < log_length; i++)
5498 for (j = 0; j < length; j += 2)
5500 first_vect = dr_chain[j];
5501 second_vect = dr_chain[j+1];
5503 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5504 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5505 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5506 first_vect, second_vect,
5507 perm_mask_even);
5508 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5509 (*result_chain)[j/2] = data_ref;
5511 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5512 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5513 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5514 first_vect, second_vect,
5515 perm_mask_odd);
5516 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5517 (*result_chain)[j/2+length/2] = data_ref;
5519 memcpy (dr_chain.address (), result_chain->address (),
5520 length * sizeof (tree));
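/* Illustrative sketch (not part of GCC itself): a plain-C model of the
   log2 (LENGTH)-stage even/odd reordering performed above, for the
   LENGTH == 4, NELT == 8 case of the running example.  On entry CHAIN
   holds element numbers 0..31 in interleaved order; on return it holds
   the strided order 0 4 8 ... / 1 5 9 ... / 2 6 10 ... / 3 7 11 ...  */

static void
example_permute_load_chain_model (int chain[4][8])
{
  int result[4][8];
  unsigned int length = 4, nelt = 8;
  unsigned int stage, i, j;

  for (stage = 0; stage < 2; stage++)        /* log2 (length) stages.  */
    {
      for (j = 0; j < length; j += 2)
        {
          /* Model of the two VEC_PERM_EXPRs built above: take the even
             and the odd elements of the concatenation of chain[j] and
             chain[j + 1].  */
          for (i = 0; i < nelt; i++)
            {
              result[j / 2][i]
                = (i * 2 < nelt
                   ? chain[j][i * 2] : chain[j + 1][i * 2 - nelt]);
              result[j / 2 + length / 2][i]
                = (i * 2 + 1 < nelt
                   ? chain[j][i * 2 + 1] : chain[j + 1][i * 2 + 1 - nelt]);
            }
        }
      /* Mirror the memcpy above that feeds RESULT_CHAIN back into
         DR_CHAIN for the next stage.  */
      for (j = 0; j < length; j++)
        for (i = 0; i < nelt; i++)
          chain[j][i] = result[j][i];
    }
}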
5525 /* Function vect_shift_permute_load_chain.
5527 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5528 a sequence of stmts to reorder the input data accordingly.
5529 Return the final references for loads in RESULT_CHAIN.
5530 Return true if it succeeded, false otherwise.
5532 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5533 The input is 3 vectors each containing 8 elements. We assign a
5534 number to each element; the input sequence is:
5536 1st vec: 0 1 2 3 4 5 6 7
5537 2nd vec: 8 9 10 11 12 13 14 15
5538 3rd vec: 16 17 18 19 20 21 22 23
5540 The output sequence should be:
5542 1st vec: 0 3 6 9 12 15 18 21
5543 2nd vec: 1 4 7 10 13 16 19 22
5544 3rd vec: 2 5 8 11 14 17 20 23
5546 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5548 First we shuffle all 3 vectors to get correct elements order:
5550 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5551 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5552 3rd vec: (16 19 22) (17 20 23) (18 21)
5554 Next we unite and shift the vectors 3 times:
5556 1st step:
5557 shift right by 6 the concatenation of:
5558 "1st vec" and "2nd vec"
5559 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5560 "2nd vec" and "3rd vec"
5561 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5562 "3rd vec" and "1st vec"
5563 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5564 | New vectors |
5566 So that now new vectors are:
5568 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5569 2nd vec: (10 13) (16 19 22) (17 20 23)
5570 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5572 2nd step:
5573 shift right by 5 the concatenation of:
5574 "1st vec" and "3rd vec"
5575 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5576 "2nd vec" and "1st vec"
5577 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5578 "3rd vec" and "2nd vec"
5579 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5580 | New vectors |
5582 So that now new vectors are:
5584 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5585 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5586 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5588 3rd step:
5589 shift right by 5 the concatenation of:
5590 "1st vec" and "1st vec"
5591 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5592 shift right by 3 the concatenation of:
5593 "2nd vec" and "2nd vec"
5594 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5595 | New vectors |
5597 So that now all vectors are READY:
5598 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5599 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5600 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5602 This algorithm is faster than the one in vect_permute_load_chain if:
5603 1. "shift of a concatenation" is faster than general permutation.
5604 This is usually so.
5605 2. The TARGET machine can't execute vector instructions in parallel.
5606 This is because each step of the algorithm depends on the previous one.
5607 The algorithm in vect_permute_load_chain is much more parallel.
5609 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5612 static bool
5613 vect_shift_permute_load_chain (vec<tree> dr_chain,
5614 unsigned int length,
5615 gimple *stmt,
5616 gimple_stmt_iterator *gsi,
5617 vec<tree> *result_chain)
5619 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5620 tree perm2_mask1, perm2_mask2, perm3_mask;
5621 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5622 gimple *perm_stmt;
5624 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5625 unsigned int i;
5626 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5627 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5628 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5629 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5631 result_chain->quick_grow (length);
5632 memcpy (result_chain->address (), dr_chain.address (),
5633 length * sizeof (tree));
5635 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5637 unsigned int j, log_length = exact_log2 (length);
5638 for (i = 0; i < nelt / 2; ++i)
5639 sel[i] = i * 2;
5640 for (i = 0; i < nelt / 2; ++i)
5641 sel[nelt / 2 + i] = i * 2 + 1;
5642 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5644 if (dump_enabled_p ())
5645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5646 "shuffle of 2 fields structure is not \
5647 supported by target\n");
5648 return false;
5650 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5652 for (i = 0; i < nelt / 2; ++i)
5653 sel[i] = i * 2 + 1;
5654 for (i = 0; i < nelt / 2; ++i)
5655 sel[nelt / 2 + i] = i * 2;
5656 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5658 if (dump_enabled_p ())
5659 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5660 "shuffle of 2 fields structure is not \
5661 supported by target\n");
5662 return false;
5664 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5666 /* Generating permutation constant to shift all elements.
5667 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5668 for (i = 0; i < nelt; i++)
5669 sel[i] = nelt / 2 + i;
5670 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5672 if (dump_enabled_p ())
5673 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5674 "shift permutation is not supported by target\n");
5675 return false;
5677 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5679 /* Generating permutation constant to select the low half of the first vector and
5680 the high half of the second. For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5681 for (i = 0; i < nelt / 2; i++)
5682 sel[i] = i;
5683 for (i = nelt / 2; i < nelt; i++)
5684 sel[i] = nelt + i;
5685 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5687 if (dump_enabled_p ())
5688 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5689 "select is not supported by target\n");
5690 return false;
5692 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5694 for (i = 0; i < log_length; i++)
5696 for (j = 0; j < length; j += 2)
5698 first_vect = dr_chain[j];
5699 second_vect = dr_chain[j + 1];
5701 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5702 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5703 first_vect, first_vect,
5704 perm2_mask1);
5705 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5706 vect[0] = data_ref;
5708 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5709 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5710 second_vect, second_vect,
5711 perm2_mask2);
5712 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5713 vect[1] = data_ref;
5715 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5716 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5717 vect[0], vect[1], shift1_mask);
5718 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5719 (*result_chain)[j/2 + length/2] = data_ref;
5721 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5722 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5723 vect[0], vect[1], select_mask);
5724 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5725 (*result_chain)[j/2] = data_ref;
5727 memcpy (dr_chain.address (), result_chain->address (),
5728 length * sizeof (tree));
5730 return true;
5732 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5734 unsigned int k = 0, l = 0;
5736 /* Generating permutation constant to get all elements in the right order.
5737 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5738 for (i = 0; i < nelt; i++)
5740 if (3 * k + (l % 3) >= nelt)
5742 k = 0;
5743 l += (3 - (nelt % 3));
5745 sel[i] = 3 * k + (l % 3);
5746 k++;
5748 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5750 if (dump_enabled_p ())
5751 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5752 "shuffle of 3 fields structure is not \
5753 supported by target\n");
5754 return false;
5756 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5758 /* Generating permutation constant to shift all elements.
5759 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5760 for (i = 0; i < nelt; i++)
5761 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5762 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5764 if (dump_enabled_p ())
5765 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5766 "shift permutation is not supported by target\n");
5767 return false;
5769 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5771 /* Generating permutation constant to shift all elements.
5772 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5773 for (i = 0; i < nelt; i++)
5774 sel[i] = 2 * (nelt / 3) + 1 + i;
5775 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5777 if (dump_enabled_p ())
5778 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5779 "shift permutation is not supported by target\n");
5780 return false;
5782 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5784 /* Generating permutation constant to shift all elements.
5785 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5786 for (i = 0; i < nelt; i++)
5787 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5788 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5790 if (dump_enabled_p ())
5791 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5792 "shift permutation is not supported by target\n");
5793 return false;
5795 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5797 /* Generating permutation constant to shift all elements.
5798 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5799 for (i = 0; i < nelt; i++)
5800 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5801 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5803 if (dump_enabled_p ())
5804 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5805 "shift permutation is not supported by target\n");
5806 return false;
5808 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5810 for (k = 0; k < 3; k++)
5812 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5813 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5814 dr_chain[k], dr_chain[k],
5815 perm3_mask);
5816 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5817 vect[k] = data_ref;
5820 for (k = 0; k < 3; k++)
5822 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5823 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5824 vect[k % 3], vect[(k + 1) % 3],
5825 shift1_mask);
5826 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5827 vect_shift[k] = data_ref;
5830 for (k = 0; k < 3; k++)
5832 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5833 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5834 vect_shift[(4 - k) % 3],
5835 vect_shift[(3 - k) % 3],
5836 shift2_mask);
5837 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5838 vect[k] = data_ref;
5841 (*result_chain)[3 - (nelt % 3)] = vect[2];
5843 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5844 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5845 vect[0], shift3_mask);
5846 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5847 (*result_chain)[nelt % 3] = data_ref;
5849 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5850 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5851 vect[1], shift4_mask);
5852 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5853 (*result_chain)[0] = data_ref;
5854 return true;
5856 return false;
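/* Illustrative sketch (not part of GCC itself): the LENGTH == 3
   selectors built above, spelled out for an assumed NELT of 8.
   PERM3_SEL receives the in-vector shuffle {0 3 6 1 4 7 2 5} and
   SHIFT1_SEL the first shift-of-a-concatenation selector
   {6 7 8 9 10 11 12 13}.  */

static void
example_shift_permute_selectors (unsigned char perm3_sel[8],
                                 unsigned char shift1_sel[8])
{
  unsigned int i, k = 0, l = 0, nelt = 8;

  /* Same construction as perm3_mask: walk the elements with stride 3,
     wrapping with an extra offset whenever the index would fall
     outside the vector.  */
  for (i = 0; i < nelt; i++)
    {
      if (3 * k + (l % 3) >= nelt)
        {
          k = 0;
          l += 3 - (nelt % 3);
        }
      perm3_sel[i] = 3 * k + (l % 3);
      k++;
    }

  /* Same construction as shift1_mask: select NELT consecutive elements
     of the two-vector concatenation, starting just after the two
     complete groups of three.  */
  for (i = 0; i < nelt; i++)
    shift1_sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
}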
5859 /* Function vect_transform_grouped_load.
5861 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5862 to perform their permutation and ascribe the resulting vectorized statements to
5863 the scalar statements.
5866 void
5867 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5868 gimple_stmt_iterator *gsi)
5870 machine_mode mode;
5871 vec<tree> result_chain = vNULL;
5873 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5874 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5875 vectors, that are ready for vector computation. */
5876 result_chain.create (size);
5878 /* If the reassociation width for the vector type is 2 or greater, the target
5879 machine can execute 2 or more vector instructions in parallel. Otherwise try
5880 to get the chain for the load group using vect_shift_permute_load_chain. */
5881 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5882 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5883 || exact_log2 (size) != -1
5884 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5885 gsi, &result_chain))
5886 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5887 vect_record_grouped_load_vectors (stmt, result_chain);
5888 result_chain.release ();
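/* Illustrative sketch (not part of GCC itself): the strategy choice
   made above as a standalone predicate.  REASSOC_WIDTH stands for the
   value of targetm.sched.reassociation_width for VEC_PERM_EXPR and
   SIZE for the group size.  The shift-based reordering is only worth
   attempting when the target cannot issue the permutations in parallel
   and SIZE is not a power of two (with the supported group sizes, that
   means SIZE == 3); otherwise vect_permute_load_chain is used
   directly.  */

static int
example_try_shift_permute_p (int reassoc_width, unsigned int size)
{
  int size_is_pow2 = size != 0 && (size & (size - 1)) == 0;

  return reassoc_width <= 1 && !size_is_pow2;
}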
5891 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5892 generated as part of the vectorization of STMT. Assign the statement
5893 for each vector to the associated scalar statement. */
5895 void
5896 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5898 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5899 gimple *next_stmt, *new_stmt;
5900 unsigned int i, gap_count;
5901 tree tmp_data_ref;
5903 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5904 Since we scan the chain starting from its first node, their order
5905 corresponds to the order of data-refs in RESULT_CHAIN. */
5906 next_stmt = first_stmt;
5907 gap_count = 1;
5908 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5910 if (!next_stmt)
5911 break;
5913 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5914 code elimination pass later. No need to check for the first stmt in
5915 the group, since it always exists.
5916 GROUP_GAP is the number of steps in elements from the previous
5917 access (if there is no gap GROUP_GAP is 1). We skip loads that
5918 correspond to the gaps. */
5919 if (next_stmt != first_stmt
5920 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5922 gap_count++;
5923 continue;
5926 while (next_stmt)
5928 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5929 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5930 copies, and we put the new vector statement in the first available
5931 RELATED_STMT. */
5932 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5933 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5934 else
5936 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5938 gimple *prev_stmt =
5939 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5940 gimple *rel_stmt =
5941 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5942 while (rel_stmt)
5944 prev_stmt = rel_stmt;
5945 rel_stmt =
5946 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5949 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5950 new_stmt;
5954 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5955 gap_count = 1;
5956 /* If NEXT_STMT accesses the same DR as the previous statement,
5957 put the same TMP_DATA_REF as its vectorized statement; otherwise
5958 get the next data-ref from RESULT_CHAIN. */
5959 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5960 break;
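/* Illustrative sketch (not part of GCC itself): a plain-C model of the
   gap skipping above, ignoring the GROUP_SAME_DR_STMT handling.
   GROUP_GAP[m] plays the role of GROUP_GAP for the M-th scalar stmt of
   the group; MAP[m] receives the index into RESULT_CHAIN that stmt M
   ends up using.  Entries of RESULT_CHAIN that no stmt claims are the
   gap loads later removed by dead code elimination.  */

static void
example_assign_load_vectors (const unsigned int group_gap[],
                             unsigned int n_members,
                             unsigned int n_results,
                             int map[])
{
  unsigned int m = 0, r, gap_count = 1;

  for (r = 0; r < n_results && m < n_members; r++)
    {
      /* The first member always takes a vector; a later member M first
         skips GROUP_GAP[M] - 1 vectors that only exist to fill the gap
         before it.  */
      if (m != 0 && gap_count < group_gap[m])
        {
          gap_count++;
          continue;
        }
      map[m++] = r;
      gap_count = 1;
    }
}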
5965 /* Function vect_can_force_dr_alignment_p.
5967 Returns whether the alignment of a DECL can be forced to be aligned
5968 on an ALIGNMENT-bit boundary. */
5970 bool
5971 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5973 if (TREE_CODE (decl) != VAR_DECL)
5974 return false;
5976 if (decl_in_symtab_p (decl)
5977 && !symtab_node::get (decl)->can_increase_alignment_p ())
5978 return false;
5980 if (TREE_STATIC (decl))
5981 return (alignment <= MAX_OFILE_ALIGNMENT);
5982 else
5983 return (alignment <= MAX_STACK_ALIGNMENT);
5987 /* Return whether the data reference DR is supported with respect to its
5988 alignment.
5989 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5990 if it is aligned, i.e., check if it is possible to vectorize it with a
5991 different alignment. */
5993 enum dr_alignment_support
5994 vect_supportable_dr_alignment (struct data_reference *dr,
5995 bool check_aligned_accesses)
5997 gimple *stmt = DR_STMT (dr);
5998 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5999 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6000 machine_mode mode = TYPE_MODE (vectype);
6001 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6002 struct loop *vect_loop = NULL;
6003 bool nested_in_vect_loop = false;
6005 if (aligned_access_p (dr) && !check_aligned_accesses)
6006 return dr_aligned;
6008 /* For now assume all conditional loads/stores support unaligned
6009 access without any special code. */
6010 if (is_gimple_call (stmt)
6011 && gimple_call_internal_p (stmt)
6012 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6013 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6014 return dr_unaligned_supported;
6016 if (loop_vinfo)
6018 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6019 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6022 /* Possibly unaligned access. */
6024 /* We can choose between using the implicit realignment scheme (generating
6025 a misaligned_move stmt) and the explicit realignment scheme (generating
6026 aligned loads with a REALIGN_LOAD). There are two variants to the
6027 explicit realignment scheme: optimized, and unoptimized.
6028 We can optimize the realignment only if the step between consecutive
6029 vector loads is equal to the vector size. Since the vector memory
6030 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6031 is guaranteed that the misalignment amount remains the same throughout the
6032 execution of the vectorized loop. Therefore, we can create the
6033 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6034 at the loop preheader.
6036 However, in the case of outer-loop vectorization, when vectorizing a
6037 memory access in the inner-loop nested within the LOOP that is now being
6038 vectorized, while it is guaranteed that the misalignment of the
6039 vectorized memory access will remain the same in different outer-loop
6040 iterations, it is *not* guaranteed that it will remain the same throughout
6041 the execution of the inner-loop. This is because the inner-loop advances
6042 with the original scalar step (and not in steps of VS). If the inner-loop
6043 step happens to be a multiple of VS, then the misalignment remains fixed
6044 and we can use the optimized realignment scheme. For example:
6046 for (i=0; i<N; i++)
6047 for (j=0; j<M; j++)
6048 s += a[i+j];
6050 When vectorizing the i-loop in the above example, the step between
6051 consecutive vector loads is 1, and so the misalignment does not remain
6052 fixed across the execution of the inner-loop, and the realignment cannot
6053 be optimized (as illustrated in the following pseudo vectorized loop):
6055 for (i=0; i<N; i+=4)
6056 for (j=0; j<M; j++){
6057 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6058 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6059 // (assuming that we start from an aligned address).
6062 We therefore have to use the unoptimized realignment scheme:
6064 for (i=0; i<N; i+=4)
6065 for (j=k; j<M; j+=4)
6066 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6067 // that the misalignment of the initial address is
6068 // 0).
6070 The loop can then be vectorized as follows:
6072 for (k=0; k<4; k++){
6073 rt = get_realignment_token (&vp[k]);
6074 for (i=0; i<N; i+=4){
6075 v1 = vp[i+k];
6076 for (j=k; j<M; j+=4){
6077 v2 = vp[i+j+VS-1];
6078 va = REALIGN_LOAD <v1,v2,rt>;
6079 vs += va;
6080 v1 = v2;
6083 } */
6085 if (DR_IS_READ (dr))
6087 bool is_packed = false;
6088 tree type = (TREE_TYPE (DR_REF (dr)));
6090 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6091 && (!targetm.vectorize.builtin_mask_for_load
6092 || targetm.vectorize.builtin_mask_for_load ()))
6094 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6096 /* If we are doing SLP then the accesses need not have the
6097 same alignment, instead it depends on the SLP group size. */
6098 if (loop_vinfo
6099 && STMT_SLP_TYPE (stmt_info)
6100 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6101 * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
6102 % TYPE_VECTOR_SUBPARTS (vectype) != 0))
6104 else if (!loop_vinfo
6105 || (nested_in_vect_loop
6106 && (TREE_INT_CST_LOW (DR_STEP (dr))
6107 != GET_MODE_SIZE (TYPE_MODE (vectype)))))
6108 return dr_explicit_realign;
6109 else
6110 return dr_explicit_realign_optimized;
6112 if (!known_alignment_for_access_p (dr))
6113 is_packed = not_size_aligned (DR_REF (dr));
6115 if ((TYPE_USER_ALIGN (type) && !is_packed)
6116 || targetm.vectorize.
6117 support_vector_misalignment (mode, type,
6118 DR_MISALIGNMENT (dr), is_packed))
6119 /* Can't software pipeline the loads, but can at least do them. */
6120 return dr_unaligned_supported;
6122 else
6124 bool is_packed = false;
6125 tree type = (TREE_TYPE (DR_REF (dr)));
6127 if (!known_alignment_for_access_p (dr))
6128 is_packed = not_size_aligned (DR_REF (dr));
6130 if ((TYPE_USER_ALIGN (type) && !is_packed)
6131 || targetm.vectorize.
6132 support_vector_misalignment (mode, type,
6133 DR_MISALIGNMENT (dr), is_packed))
6134 return dr_unaligned_supported;
6137 /* Unsupported. */
6138 return dr_unaligned_unsupported;
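/* Illustrative sketch (not part of GCC itself): a plain-C model of
   what REALIGN_LOAD <v1, v2, rt> computes in the pseudo code of the
   explicit-realignment comment above, modelling the realignment token
   RT simply as the misalignment K in elements (the real token is
   target specific, e.g. a permute control vector).  V1 is the aligned
   vector covering the start of the misaligned access, V2 the next
   aligned vector.  */

static void
example_realign_load (const int v1[8], const int v2[8],
                      unsigned int k /* misalignment in elements */,
                      int result[8])
{
  unsigned int i, nelt = 8;

  /* Take the NELT consecutive elements that start K elements into the
     concatenation of V1 and V2.  */
  for (i = 0; i < nelt; i++)
    result[i] = i + k < nelt ? v1[i + k] : v2[i + k - nelt];
}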