gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2016 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "tm_p.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "cgraph.h"
35 #include "dumpfile.h"
36 #include "alias.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "tree-eh.h"
40 #include "gimplify.h"
41 #include "gimple-iterator.h"
42 #include "gimplify-me.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "tree-ssa-loop-manip.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "expr.h"
50 #include "builtins.h"
51 #include "params.h"
53 /* Return true if load- or store-lanes optab OPTAB is implemented for
54 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
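/* As an illustrative sketch (sizes assumed, not taken from a particular
   target): for COUNT == 3 and a 16-byte VECTYPE this looks for a
   48-byte integer array mode and then for a corresponding
   vec_load_lanes / vec_store_lanes pattern converting between that
   array mode and the vector mode; if either is missing, the lanes
   optab cannot be used.  */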
56 static bool
57 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
58 tree vectype, unsigned HOST_WIDE_INT count)
60 machine_mode mode, array_mode;
61 bool limit_p;
63 mode = TYPE_MODE (vectype);
64 limit_p = !targetm.array_mode_supported_p (mode, count);
65 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
66 MODE_INT, limit_p);
68 if (array_mode == BLKmode)
70 if (dump_enabled_p ())
71 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
72 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
73 GET_MODE_NAME (mode), count);
74 return false;
77 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
79 if (dump_enabled_p ())
80 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
81 "cannot use %s<%s><%s>\n", name,
82 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
83 return false;
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_NOTE, vect_location,
88 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
89 GET_MODE_NAME (mode));
91 return true;
95 /* Return the smallest scalar part of STMT.
96 This is used to determine the vectype of the stmt. We generally set the
97 vectype according to the type of the result (lhs). For stmts whose
98 result-type is different than the type of the arguments (e.g., demotion,
99 promotion), vectype will be reset appropriately (later). Note that we have
100 to visit the smallest datatype in this function, because that determines the
101 VF. If the smallest datatype in the loop is present only as the rhs of a
102 promotion operation - we'd miss it.
103 Such a case, where a variable of this datatype does not appear in the lhs
104 anywhere in the loop, can only occur if it's an invariant: e.g.:
105 'int_x = (int) short_inv', which we'd expect to have been optimized away by
106 invariant motion. However, we cannot rely on invariant motion to always
107 take invariants out of the loop, and so in the case of promotion we also
108 have to check the rhs.
109 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
110 types. */
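/* An illustrative loop (not taken from a testcase):

     short *s;  int *d;
     for (i = 0; i < n; i++)
       d[i] = (int) s[i];

   The statement type is int, but the smallest scalar type is short,
   and it is the short accesses that determine the VF.  */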
112 tree
113 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
114 HOST_WIDE_INT *rhs_size_unit)
116 tree scalar_type = gimple_expr_type (stmt);
117 HOST_WIDE_INT lhs, rhs;
119 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
121 if (is_gimple_assign (stmt)
122 && (gimple_assign_cast_p (stmt)
123 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
124 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
125 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
127 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
129 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
130 if (rhs < lhs)
131 scalar_type = rhs_type;
134 *lhs_size_unit = lhs;
135 *rhs_size_unit = rhs;
136 return scalar_type;
140 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
141 tested at run-time. Return TRUE if DDR was successfully inserted.
142 Return false if versioning is not supported. */
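/* A typical candidate (illustrative only) is

     void f (int *p, int *q, int n)
     {
       for (int i = 0; i < n; i++)
         p[i] = q[i] + 1;
     }

   where p and q may alias; the loop can still be vectorized by
   versioning it under a run-time check that the accessed ranges do
   not overlap.  */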
144 static bool
145 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
147 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
149 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
150 return false;
152 if (dump_enabled_p ())
154 dump_printf_loc (MSG_NOTE, vect_location,
155 "mark for run-time aliasing test between ");
156 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
157 dump_printf (MSG_NOTE, " and ");
158 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
159 dump_printf (MSG_NOTE, "\n");
162 if (optimize_loop_nest_for_size_p (loop))
164 if (dump_enabled_p ())
165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
166 "versioning not supported when optimizing"
167 " for size.\n");
168 return false;
171 /* FORNOW: We don't support versioning with outer-loop vectorization. */
172 if (loop->inner)
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
176 "versioning not yet supported for outer-loops.\n");
177 return false;
180 /* FORNOW: We don't support creating runtime alias tests for non-constant
181 step. */
182 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
183 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
185 if (dump_enabled_p ())
186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
187 "versioning not yet supported for non-constant "
188 "step\n");
189 return false;
192 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
193 return true;
197 /* Function vect_analyze_data_ref_dependence.
199 Return TRUE if there (might) exist a dependence between a memory-reference
200 DRA and a memory-reference DRB. If the dependence can instead be
201 checked at run-time by versioning for alias, return FALSE. Adjust
202 *MAX_VF according to the data dependence. */
204 static bool
205 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
206 loop_vec_info loop_vinfo, int *max_vf)
208 unsigned int i;
209 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
210 struct data_reference *dra = DDR_A (ddr);
211 struct data_reference *drb = DDR_B (ddr);
212 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
213 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
214 lambda_vector dist_v;
215 unsigned int loop_depth;
217 /* In loop analysis all data references should be vectorizable. */
218 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
219 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
220 gcc_unreachable ();
222 /* Independent data accesses. */
223 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
224 return false;
226 if (dra == drb
227 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
228 return false;
230 /* We do not have to consider dependences between accesses that belong
231 to the same group. */
232 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
233 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
234 return false;
236 /* Even if we have an anti-dependence then, as the vectorized loop covers at
237 least two scalar iterations, there is always also a true dependence.
238 As the vectorizer does not re-order loads and stores we can ignore
239 the anti-dependence if TBAA can disambiguate both DRs similar to the
240 case with known negative distance anti-dependences (positive
241 distance anti-dependences would violate TBAA constraints). */
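  /* For example (illustrative): with  int *pi  and  float *pf, whose
     alias sets do not conflict, a read from *pi followed by a write to
     *pf cannot be part of a real dependence, so the pair is dismissed
     here.  */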
242 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
243 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
244 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
245 get_alias_set (DR_REF (drb))))
246 return false;
248 /* Unknown data dependence. */
249 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
251 /* If user asserted safelen consecutive iterations can be
252 executed concurrently, assume independence. */
253 if (loop->safelen >= 2)
255 if (loop->safelen < *max_vf)
256 *max_vf = loop->safelen;
257 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
258 return false;
261 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
262 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
264 if (dump_enabled_p ())
266 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
267 "versioning for alias not supported for: "
268 "can't determine dependence between ");
269 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
270 DR_REF (dra));
271 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
272 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
273 DR_REF (drb));
274 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
276 return true;
279 if (dump_enabled_p ())
281 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
282 "versioning for alias required: "
283 "can't determine dependence between ");
284 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
285 DR_REF (dra));
286 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
287 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
288 DR_REF (drb));
289 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
292 /* Add to list of ddrs that need to be tested at run-time. */
293 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
296 /* Known data dependence. */
297 if (DDR_NUM_DIST_VECTS (ddr) == 0)
299 /* If user asserted safelen consecutive iterations can be
300 executed concurrently, assume independence. */
301 if (loop->safelen >= 2)
303 if (loop->safelen < *max_vf)
304 *max_vf = loop->safelen;
305 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
306 return false;
309 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
310 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
312 if (dump_enabled_p ())
314 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
315 "versioning for alias not supported for: "
316 "bad dist vector for ");
317 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
318 DR_REF (dra));
319 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
320 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
321 DR_REF (drb));
322 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
324 return true;
327 if (dump_enabled_p ())
329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
330 "versioning for alias required: "
331 "bad dist vector for ");
332 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
333 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
334 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
335 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
337 /* Add to list of ddrs that need to be tested at run-time. */
338 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
341 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
342 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
344 int dist = dist_v[loop_depth];
346 if (dump_enabled_p ())
347 dump_printf_loc (MSG_NOTE, vect_location,
348 "dependence distance = %d.\n", dist);
350 if (dist == 0)
352 if (dump_enabled_p ())
354 dump_printf_loc (MSG_NOTE, vect_location,
355 "dependence distance == 0 between ");
356 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
357 dump_printf (MSG_NOTE, " and ");
358 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
359 dump_printf (MSG_NOTE, "\n");
362 /* When we perform grouped accesses and perform implicit CSE
363 by detecting equal accesses and doing disambiguation with
364 runtime alias tests like for
365 .. = a[i];
366 .. = a[i+1];
367 a[i] = ..;
368 a[i+1] = ..;
369 *p = ..;
370 .. = a[i];
371 .. = a[i+1];
372 where we will end up loading { a[i], a[i+1] } once, make
373 sure that inserting group loads before the first load and
374 stores after the last store will do the right thing.
375 Similar for groups like
376 a[i] = ...;
377 ... = a[i];
378 a[i+1] = ...;
379 where loads from the group interleave with the store. */
380 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
381 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
383 gimple *earlier_stmt;
384 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
385 if (DR_IS_WRITE
386 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
388 if (dump_enabled_p ())
389 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
390 "READ_WRITE dependence in interleaving."
391 "\n");
392 return true;
396 continue;
399 if (dist > 0 && DDR_REVERSED_P (ddr))
401 /* If DDR_REVERSED_P the order of the data-refs in DDR was
402 reversed (to make distance vector positive), and the actual
403 distance is negative. */
404 if (dump_enabled_p ())
405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
406 "dependence distance negative.\n");
407 /* Record a negative dependence distance to later limit the
408 amount of stmt copying / unrolling we can perform.
409 Only need to handle read-after-write dependence. */
410 if (DR_IS_READ (drb)
411 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
412 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
413 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
414 continue;
417 if (abs (dist) >= 2
418 && abs (dist) < *max_vf)
420 /* The dependence distance requires reduction of the maximal
421 vectorization factor. */
422 *max_vf = abs (dist);
423 if (dump_enabled_p ())
424 dump_printf_loc (MSG_NOTE, vect_location,
425 "adjusting maximal vectorization factor to %i\n",
426 *max_vf);
429 if (abs (dist) >= *max_vf)
431 /* Dependence distance does not create dependence, as far as
432 vectorization is concerned, in this case. */
433 if (dump_enabled_p ())
434 dump_printf_loc (MSG_NOTE, vect_location,
435 "dependence distance >= VF.\n");
436 continue;
439 if (dump_enabled_p ())
441 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
442 "not vectorized, possible dependence "
443 "between data-refs ");
444 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
445 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
446 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
447 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
450 return true;
453 return false;
456 /* Function vect_analyze_data_ref_dependences.
458 Examine all the data references in the loop, and make sure there do not
459 exist any data dependences between them. Set *MAX_VF according to
460 the maximum vectorization factor the data dependences allow. */
462 bool
463 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
465 unsigned int i;
466 struct data_dependence_relation *ddr;
468 if (dump_enabled_p ())
469 dump_printf_loc (MSG_NOTE, vect_location,
470 "=== vect_analyze_data_ref_dependences ===\n");
472 LOOP_VINFO_DDRS (loop_vinfo)
473 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
474 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
475 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
476 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
477 &LOOP_VINFO_DDRS (loop_vinfo),
478 LOOP_VINFO_LOOP_NEST (loop_vinfo), false))
479 return false;
481 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
482 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
483 return false;
485 return true;
489 /* Function vect_slp_analyze_data_ref_dependence.
491 Return TRUE if there (might) exist a dependence between a memory-reference
492 DRA and a memory-reference DRB. Unlike the loop variant, there is
493 no run-time alias versioning fallback here and no vectorization
494 factor to adjust. */
496 static bool
497 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
499 struct data_reference *dra = DDR_A (ddr);
500 struct data_reference *drb = DDR_B (ddr);
502 /* We need to check dependences of statements marked as unvectorizable
503 as well; they can still prohibit vectorization. */
505 /* Independent data accesses. */
506 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
507 return false;
509 if (dra == drb)
510 return false;
512 /* Read-read is OK. */
513 if (DR_IS_READ (dra) && DR_IS_READ (drb))
514 return false;
516 /* If dra and drb are part of the same interleaving chain consider
517 them independent. */
518 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
519 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
520 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
521 return false;
523 /* Unknown data dependence. */
524 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
526 if (dump_enabled_p ())
528 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
529 "can't determine dependence between ");
530 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
531 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
532 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
533 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
536 else if (dump_enabled_p ())
538 dump_printf_loc (MSG_NOTE, vect_location,
539 "determined dependence between ");
540 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
541 dump_printf (MSG_NOTE, " and ");
542 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
543 dump_printf (MSG_NOTE, "\n");
546 return true;
550 /* Analyze dependences involved in the transform of SLP NODE. STORES
551 contains the vector of scalar stores of this instance if we are
552 disambiguating the loads. */
554 static bool
555 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
556 vec<gimple *> stores, gimple *last_store)
558 /* This walks over all stmts involved in the SLP load/store done
559 in NODE verifying we can sink them up to the last stmt in the
560 group. */
561 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
562 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
564 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
565 if (access == last_access)
566 continue;
567 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
568 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
569 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
571 gimple *stmt = gsi_stmt (gsi);
572 if (! gimple_vuse (stmt)
573 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
574 continue;
576 /* If we couldn't record a (single) data reference for this
577 stmt we have to give up. */
578 /* ??? Here and below if dependence analysis fails we can resort
579 to the alias oracle which can handle more kinds of stmts. */
580 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
581 if (!dr_b)
582 return false;
584 /* If we run into a store of this same instance (we've just
585 marked those) then delay dependence checking until we run
586 into the last store because this is where it will have
587 been sunk to (and we verify if we can do that as well). */
588 if (gimple_visited_p (stmt))
590 if (stmt != last_store)
591 continue;
592 unsigned i;
593 gimple *store;
594 FOR_EACH_VEC_ELT (stores, i, store)
596 data_reference *store_dr
597 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
598 ddr_p ddr = initialize_data_dependence_relation
599 (dr_a, store_dr, vNULL);
600 if (vect_slp_analyze_data_ref_dependence (ddr))
602 free_dependence_relation (ddr);
603 return false;
605 free_dependence_relation (ddr);
609 ddr_p ddr = initialize_data_dependence_relation (dr_a, dr_b, vNULL);
610 if (vect_slp_analyze_data_ref_dependence (ddr))
612 free_dependence_relation (ddr);
613 return false;
615 free_dependence_relation (ddr);
618 return true;
622 /* Function vect_slp_analyze_instance_dependence.
624 Examine all the data references in the SLP instance, and make sure
625 there do not exist any data dependences between them that would
626 prevent the transformation. */
628 bool
629 vect_slp_analyze_instance_dependence (slp_instance instance)
631 if (dump_enabled_p ())
632 dump_printf_loc (MSG_NOTE, vect_location,
633 "=== vect_slp_analyze_instance_dependence ===\n");
635 /* The stores of this instance are at the root of the SLP tree. */
636 slp_tree store = SLP_INSTANCE_TREE (instance);
637 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
638 store = NULL;
640 /* Verify we can sink stores to the vectorized stmt insert location. */
641 gimple *last_store = NULL;
642 if (store)
644 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
645 return false;
647 /* Mark stores in this instance and remember the last one. */
648 last_store = vect_find_last_scalar_stmt_in_slp (store);
649 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
650 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
653 bool res = true;
655 /* Verify we can sink loads to the vectorized stmt insert location,
656 special-casing stores of this instance. */
657 slp_tree load;
658 unsigned int i;
659 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
660 if (! vect_slp_analyze_node_dependences (instance, load,
661 store
662 ? SLP_TREE_SCALAR_STMTS (store)
663 : vNULL, last_store))
665 res = false;
666 break;
669 /* Unset the visited flag. */
670 if (store)
671 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
672 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
674 return res;
677 /* Function vect_compute_data_ref_alignment
679 Compute the misalignment of the data reference DR.
681 Output:
682 1. If during the misalignment computation it is found that the data reference
683 cannot be vectorized then false is returned.
684 2. DR_MISALIGNMENT (DR) is defined.
686 FOR NOW: No analysis is actually performed. Misalignment is calculated
687 only for trivial cases. TODO. */
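/* A worked example (illustrative, assuming a 16-byte V4SI vectype):
   for the access a[i+1] with int a[N] and a 16-byte aligned base,
   DR_INIT is 4 and the computed DR_MISALIGNMENT is 4 bytes; for an
   access through an arbitrary int *p the base alignment is typically
   unknown and DR_MISALIGNMENT stays -1.  */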
689 bool
690 vect_compute_data_ref_alignment (struct data_reference *dr)
692 gimple *stmt = DR_STMT (dr);
693 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
694 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
695 struct loop *loop = NULL;
696 tree ref = DR_REF (dr);
697 tree vectype;
698 tree base, base_addr;
699 tree misalign = NULL_TREE;
700 tree aligned_to;
701 unsigned HOST_WIDE_INT alignment;
703 if (dump_enabled_p ())
704 dump_printf_loc (MSG_NOTE, vect_location,
705 "vect_compute_data_ref_alignment:\n");
707 if (loop_vinfo)
708 loop = LOOP_VINFO_LOOP (loop_vinfo);
710 /* Initialize misalignment to unknown. */
711 SET_DR_MISALIGNMENT (dr, -1);
713 if (tree_fits_shwi_p (DR_STEP (dr)))
714 misalign = DR_INIT (dr);
715 aligned_to = DR_ALIGNED_TO (dr);
716 base_addr = DR_BASE_ADDRESS (dr);
717 vectype = STMT_VINFO_VECTYPE (stmt_info);
719 /* In case the dataref is in an inner-loop of the loop that is being
720 vectorized (LOOP), we use the base and misalignment information
721 relative to the outer-loop (LOOP). This is ok only if the misalignment
722 stays the same throughout the execution of the inner-loop, which is why
723 we have to check that the stride of the dataref in the inner-loop is
724 a multiple of the vector size. */
725 if (loop && nested_in_vect_loop_p (loop, stmt))
727 tree step = DR_STEP (dr);
729 if (tree_fits_shwi_p (step)
730 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
732 if (dump_enabled_p ())
733 dump_printf_loc (MSG_NOTE, vect_location,
734 "inner step divides the vector-size.\n");
735 misalign = STMT_VINFO_DR_INIT (stmt_info);
736 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
737 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
739 else
741 if (dump_enabled_p ())
742 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
743 "inner step doesn't divide the vector-size.\n");
744 misalign = NULL_TREE;
748 /* Similarly we can only use base and misalignment information relative to
749 an innermost loop if the misalignment stays the same throughout the
750 execution of the loop. As above, this is the case if the stride of the
751 dataref times the vectorization factor is a multiple of the vector size. */
752 else
754 tree step = DR_STEP (dr);
755 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
757 if (tree_fits_shwi_p (step)
758 && ((tree_to_shwi (step) * vf)
759 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
761 if (dump_enabled_p ())
762 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
763 "step doesn't divide the vector-size.\n");
764 misalign = NULL_TREE;
768 /* To look at alignment of the base we have to preserve an inner MEM_REF
769 as that carries alignment information of the actual access. */
770 base = ref;
771 while (handled_component_p (base))
772 base = TREE_OPERAND (base, 0);
773 if (TREE_CODE (base) == MEM_REF)
774 base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
775 build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
776 unsigned int base_alignment = get_object_alignment (base);
778 if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
779 DR_VECT_AUX (dr)->base_element_aligned = true;
781 alignment = TYPE_ALIGN_UNIT (vectype);
783 if ((compare_tree_int (aligned_to, alignment) < 0)
784 || !misalign)
786 if (dump_enabled_p ())
788 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
789 "Unknown alignment for access: ");
790 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
791 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
793 return true;
796 if (base_alignment < TYPE_ALIGN (vectype))
798 /* Strip an inner MEM_REF to a bare decl if possible. */
799 if (TREE_CODE (base) == MEM_REF
800 && integer_zerop (TREE_OPERAND (base, 1))
801 && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
802 base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
804 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
806 if (dump_enabled_p ())
808 dump_printf_loc (MSG_NOTE, vect_location,
809 "can't force alignment of ref: ");
810 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
811 dump_printf (MSG_NOTE, "\n");
813 return true;
816 /* Force the alignment of the decl.
817 NOTE: This is the only change to the code we make during
818 the analysis phase, before deciding to vectorize the loop. */
819 if (dump_enabled_p ())
821 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
822 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
823 dump_printf (MSG_NOTE, "\n");
826 DR_VECT_AUX (dr)->base_decl = base;
827 DR_VECT_AUX (dr)->base_misaligned = true;
828 DR_VECT_AUX (dr)->base_element_aligned = true;
831 /* If this is a backward running DR then the first access in the larger
832 vectype is actually N-1 elements before the address in the DR.
833 Adjust misalign accordingly. */
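  /* E.g. (illustrative): for a V4SI access with DR_STEP == -4 the
     vector load also covers the 3 elements below DR_INIT, so
     3 * -4 == -12 is added to the misalignment before it is reduced
     modulo the vector alignment below.  */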
834 if (tree_int_cst_sgn (DR_STEP (dr)) < 0)
836 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
837 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
838 otherwise we wouldn't be here. */
839 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
840 /* PLUS because DR_STEP was negative. */
841 misalign = size_binop (PLUS_EXPR, misalign, offset);
844 SET_DR_MISALIGNMENT (dr,
845 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
847 if (dump_enabled_p ())
849 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
850 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
851 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
852 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
855 return true;
859 /* Function vect_update_misalignment_for_peel
861 DR - the data reference whose misalignment is to be adjusted.
862 DR_PEEL - the data reference whose misalignment is being made
863 zero in the vector loop by the peel.
864 NPEEL - the number of iterations in the peel loop if the misalignment
865 of DR_PEEL is known at compile time. */
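/* For instance (numbers assumed for illustration, positive step): with
   a 16-byte vector alignment, a 4-byte element, DR_MISALIGNMENT (DR) == 8
   and NPEEL == 3, the peel loop advances DR by 3 * 4 == 12 bytes, so the
   new misalignment becomes (8 + 12) & 15 == 4.  */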
867 static void
868 vect_update_misalignment_for_peel (struct data_reference *dr,
869 struct data_reference *dr_peel, int npeel)
871 unsigned int i;
872 vec<dr_p> same_align_drs;
873 struct data_reference *current_dr;
874 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
875 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
876 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
877 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
879 /* For interleaved data accesses the step in the loop must be multiplied by
880 the size of the interleaving group. */
881 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
882 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
883 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
884 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
886 /* It can be assumed that the data refs with the same alignment as dr_peel
887 are aligned in the vector loop. */
888 same_align_drs
889 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
890 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
892 if (current_dr != dr)
893 continue;
894 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
895 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
896 SET_DR_MISALIGNMENT (dr, 0);
897 return;
900 if (known_alignment_for_access_p (dr)
901 && known_alignment_for_access_p (dr_peel))
903 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
904 int misal = DR_MISALIGNMENT (dr);
905 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
906 misal += negative ? -npeel * dr_size : npeel * dr_size;
907 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
908 SET_DR_MISALIGNMENT (dr, misal);
909 return;
912 if (dump_enabled_p ())
913 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
914 SET_DR_MISALIGNMENT (dr, -1);
918 /* Function verify_data_ref_alignment
920 Return TRUE if DR can be handled with respect to alignment. */
922 static bool
923 verify_data_ref_alignment (data_reference_p dr)
925 enum dr_alignment_support supportable_dr_alignment
926 = vect_supportable_dr_alignment (dr, false);
927 if (!supportable_dr_alignment)
929 if (dump_enabled_p ())
931 if (DR_IS_READ (dr))
932 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
933 "not vectorized: unsupported unaligned load.");
934 else
935 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
936 "not vectorized: unsupported unaligned "
937 "store.");
939 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
940 DR_REF (dr));
941 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
943 return false;
946 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
947 dump_printf_loc (MSG_NOTE, vect_location,
948 "Vectorizing an unaligned access.\n");
950 return true;
953 /* Function vect_verify_datarefs_alignment
955 Return TRUE if all data references in the loop can be
956 handled with respect to alignment. */
958 bool
959 vect_verify_datarefs_alignment (loop_vec_info vinfo)
961 vec<data_reference_p> datarefs = vinfo->datarefs;
962 struct data_reference *dr;
963 unsigned int i;
965 FOR_EACH_VEC_ELT (datarefs, i, dr)
967 gimple *stmt = DR_STMT (dr);
968 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
970 if (!STMT_VINFO_RELEVANT_P (stmt_info))
971 continue;
973 /* For interleaving, only the alignment of the first access matters. */
974 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
975 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
976 continue;
978 /* Strided accesses perform only component accesses, alignment is
979 irrelevant for them. */
980 if (STMT_VINFO_STRIDED_P (stmt_info)
981 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
982 continue;
984 if (! verify_data_ref_alignment (dr))
985 return false;
988 return true;
991 /* Given a memory reference EXP, return whether its alignment is less
992 than its size. */
994 static bool
995 not_size_aligned (tree exp)
997 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
998 return true;
1000 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1001 > get_object_alignment (exp));
1004 /* Function vector_alignment_reachable_p
1006 Return true if vector alignment for DR is reachable by peeling
1007 a few loop iterations. Return false otherwise. */
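/* Illustrative numbers (assuming a 16-byte V4SI vector): with
   DR_MISALIGNMENT == 8 bytes, peeling 4 - 8/4 == 2 scalar iterations
   reaches alignment; with DR_MISALIGNMENT == 6 bytes no whole number
   of 4-byte steps can reach it, since the element size does not
   divide the misalignment.  */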
1009 static bool
1010 vector_alignment_reachable_p (struct data_reference *dr)
1012 gimple *stmt = DR_STMT (dr);
1013 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1014 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1016 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1018 /* For interleaved access we peel only if the number of iterations in
1019 the prolog loop (VF - misalignment) is a multiple of the
1020 number of interleaved accesses. */
1021 int elem_size, mis_in_elements;
1022 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1024 /* FORNOW: handle only known alignment. */
1025 if (!known_alignment_for_access_p (dr))
1026 return false;
1028 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1029 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1031 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1032 return false;
1035 /* If the misalignment is known at compile time then allow peeling
1036 only if natural alignment is reachable through peeling. */
1037 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1039 HOST_WIDE_INT elmsize =
1040 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1041 if (dump_enabled_p ())
1043 dump_printf_loc (MSG_NOTE, vect_location,
1044 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1045 dump_printf (MSG_NOTE,
1046 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1048 if (DR_MISALIGNMENT (dr) % elmsize)
1050 if (dump_enabled_p ())
1051 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1052 "data size does not divide the misalignment.\n");
1053 return false;
1057 if (!known_alignment_for_access_p (dr))
1059 tree type = TREE_TYPE (DR_REF (dr));
1060 bool is_packed = not_size_aligned (DR_REF (dr));
1061 if (dump_enabled_p ())
1062 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1063 "Unknown misalignment, is_packed = %d\n",is_packed);
1064 if ((TYPE_USER_ALIGN (type) && !is_packed)
1065 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1066 return true;
1067 else
1068 return false;
1071 return true;
1075 /* Calculate the cost of the memory access represented by DR. */
1077 static void
1078 vect_get_data_access_cost (struct data_reference *dr,
1079 unsigned int *inside_cost,
1080 unsigned int *outside_cost,
1081 stmt_vector_for_cost *body_cost_vec)
1083 gimple *stmt = DR_STMT (dr);
1084 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1085 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1086 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1087 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1088 int ncopies = vf / nunits;
1090 if (DR_IS_READ (dr))
1091 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1092 NULL, body_cost_vec, false);
1093 else
1094 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1096 if (dump_enabled_p ())
1097 dump_printf_loc (MSG_NOTE, vect_location,
1098 "vect_get_data_access_cost: inside_cost = %d, "
1099 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1103 typedef struct _vect_peel_info
1105 int npeel;
1106 struct data_reference *dr;
1107 unsigned int count;
1108 } *vect_peel_info;
1110 typedef struct _vect_peel_extended_info
1112 struct _vect_peel_info peel_info;
1113 unsigned int inside_cost;
1114 unsigned int outside_cost;
1115 stmt_vector_for_cost body_cost_vec;
1116 } *vect_peel_extended_info;
1119 /* Peeling hashtable helpers. */
1121 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1123 static inline hashval_t hash (const _vect_peel_info *);
1124 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1127 inline hashval_t
1128 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1130 return (hashval_t) peel_info->npeel;
1133 inline bool
1134 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1136 return (a->npeel == b->npeel);
1140 /* Insert DR into peeling hash table with NPEEL as key. */
1142 static void
1143 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1144 loop_vec_info loop_vinfo, struct data_reference *dr,
1145 int npeel)
1147 struct _vect_peel_info elem, *slot;
1148 _vect_peel_info **new_slot;
1149 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1151 elem.npeel = npeel;
1152 slot = peeling_htab->find (&elem);
1153 if (slot)
1154 slot->count++;
1155 else
1157 slot = XNEW (struct _vect_peel_info);
1158 slot->npeel = npeel;
1159 slot->dr = dr;
1160 slot->count = 1;
1161 new_slot = peeling_htab->find_slot (slot, INSERT);
1162 *new_slot = slot;
1165 if (!supportable_dr_alignment
1166 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1167 slot->count += VECT_MAX_COST;
1171 /* Traverse peeling hash table to find peeling option that aligns maximum
1172 number of data accesses. */
1175 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1176 _vect_peel_extended_info *max)
1178 vect_peel_info elem = *slot;
1180 if (elem->count > max->peel_info.count
1181 || (elem->count == max->peel_info.count
1182 && max->peel_info.npeel > elem->npeel))
1184 max->peel_info.npeel = elem->npeel;
1185 max->peel_info.count = elem->count;
1186 max->peel_info.dr = elem->dr;
1189 return 1;
1193 /* Traverse peeling hash table and calculate cost for each peeling option.
1194 Find the one with the lowest cost. */
1197 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1198 _vect_peel_extended_info *min)
1200 vect_peel_info elem = *slot;
1201 int save_misalignment, dummy;
1202 unsigned int inside_cost = 0, outside_cost = 0, i;
1203 gimple *stmt = DR_STMT (elem->dr);
1204 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1205 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1206 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1207 struct data_reference *dr;
1208 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1210 prologue_cost_vec.create (2);
1211 body_cost_vec.create (2);
1212 epilogue_cost_vec.create (2);
1214 FOR_EACH_VEC_ELT (datarefs, i, dr)
1216 stmt = DR_STMT (dr);
1217 stmt_info = vinfo_for_stmt (stmt);
1218 /* For interleaving, only the alignment of the first access
1219 matters. */
1220 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1221 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1222 continue;
1224 /* Strided accesses perform only component accesses, alignment is
1225 irrelevant for them. */
1226 if (STMT_VINFO_STRIDED_P (stmt_info)
1227 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1228 continue;
1230 save_misalignment = DR_MISALIGNMENT (dr);
1231 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1232 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1233 &body_cost_vec);
1234 SET_DR_MISALIGNMENT (dr, save_misalignment);
1237 outside_cost += vect_get_known_peeling_cost
1238 (loop_vinfo, elem->npeel, &dummy,
1239 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1240 &prologue_cost_vec, &epilogue_cost_vec);
1242 /* Prologue and epilogue costs are added to the target model later.
1243 These costs depend only on the scalar iteration cost, the
1244 number of peeling iterations finally chosen, and the number of
1245 misaligned statements. So discard the information found here. */
1246 prologue_cost_vec.release ();
1247 epilogue_cost_vec.release ();
1249 if (inside_cost < min->inside_cost
1250 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1252 min->inside_cost = inside_cost;
1253 min->outside_cost = outside_cost;
1254 min->body_cost_vec.release ();
1255 min->body_cost_vec = body_cost_vec;
1256 min->peel_info.dr = elem->dr;
1257 min->peel_info.npeel = elem->npeel;
1259 else
1260 body_cost_vec.release ();
1262 return 1;
1266 /* Choose best peeling option by traversing peeling hash table and either
1267 choosing an option with the lowest cost (if cost model is enabled) or the
1268 option that aligns as many accesses as possible. */
1270 static struct data_reference *
1271 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1272 loop_vec_info loop_vinfo,
1273 unsigned int *npeel,
1274 stmt_vector_for_cost *body_cost_vec)
1276 struct _vect_peel_extended_info res;
1278 res.peel_info.dr = NULL;
1279 res.body_cost_vec = stmt_vector_for_cost ();
1281 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1283 res.inside_cost = INT_MAX;
1284 res.outside_cost = INT_MAX;
1285 peeling_htab->traverse <_vect_peel_extended_info *,
1286 vect_peeling_hash_get_lowest_cost> (&res);
1288 else
1290 res.peel_info.count = 0;
1291 peeling_htab->traverse <_vect_peel_extended_info *,
1292 vect_peeling_hash_get_most_frequent> (&res);
1295 *npeel = res.peel_info.npeel;
1296 *body_cost_vec = res.body_cost_vec;
1297 return res.peel_info.dr;
1301 /* Function vect_enhance_data_refs_alignment
1303 This pass will use loop versioning and loop peeling in order to enhance
1304 the alignment of data references in the loop.
1306 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1307 original loop is to be vectorized. Any other loops that are created by
1308 the transformations performed in this pass are not supposed to be
1309 vectorized. This restriction will be relaxed.
1311 This pass will require a cost model to guide it whether to apply peeling
1312 or versioning or a combination of the two. For example, the scheme that
1313 Intel uses when given a loop with several memory accesses is as follows:
1314 choose one memory access ('p') whose alignment you want to force by doing
1315 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1316 other accesses are not necessarily aligned, or (2) use loop versioning to
1317 generate one loop in which all accesses are aligned, and another loop in
1318 which only 'p' is necessarily aligned.
1320 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1321 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1322 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1324 Devising a cost model is the most critical aspect of this work. It will
1325 guide us on which access to peel for, whether to use loop versioning, how
1326 many versions to create, etc. The cost model will probably consist of
1327 generic considerations as well as target specific considerations (on
1328 powerpc for example, misaligned stores are more painful than misaligned
1329 loads).
1331 Here are the general steps involved in alignment enhancements:
1333 -- original loop, before alignment analysis:
1334 for (i=0; i<N; i++){
1335 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1336 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1339 -- After vect_compute_data_refs_alignment:
1340 for (i=0; i<N; i++){
1341 x = q[i]; # DR_MISALIGNMENT(q) = 3
1342 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1345 -- Possibility 1: we do loop versioning:
1346 if (p is aligned) {
1347 for (i=0; i<N; i++){ # loop 1A
1348 x = q[i]; # DR_MISALIGNMENT(q) = 3
1349 p[i] = y; # DR_MISALIGNMENT(p) = 0
1352 else {
1353 for (i=0; i<N; i++){ # loop 1B
1354 x = q[i]; # DR_MISALIGNMENT(q) = 3
1355 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1359 -- Possibility 2: we do loop peeling:
1360 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1361 x = q[i];
1362 p[i] = y;
1364 for (i = 3; i < N; i++){ # loop 2A
1365 x = q[i]; # DR_MISALIGNMENT(q) = 0
1366 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1369 -- Possibility 3: combination of loop peeling and versioning:
1370 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1371 x = q[i];
1372 p[i] = y;
1374 if (p is aligned) {
1375 for (i = 3; i<N; i++){ # loop 3A
1376 x = q[i]; # DR_MISALIGNMENT(q) = 0
1377 p[i] = y; # DR_MISALIGNMENT(p) = 0
1380 else {
1381 for (i = 3; i<N; i++){ # loop 3B
1382 x = q[i]; # DR_MISALIGNMENT(q) = 0
1383 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1387 These loops are later passed to loop_transform to be vectorized. The
1388 vectorizer will use the alignment information to guide the transformation
1389 (whether to generate regular loads/stores, or with special handling for
1390 misalignment). */
1392 bool
1393 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1395 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1396 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1397 enum dr_alignment_support supportable_dr_alignment;
1398 struct data_reference *dr0 = NULL, *first_store = NULL;
1399 struct data_reference *dr;
1400 unsigned int i, j;
1401 bool do_peeling = false;
1402 bool do_versioning = false;
1403 bool stat;
1404 gimple *stmt;
1405 stmt_vec_info stmt_info;
1406 unsigned int npeel = 0;
1407 bool all_misalignments_unknown = true;
1408 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1409 unsigned possible_npeel_number = 1;
1410 tree vectype;
1411 unsigned int nelements, mis, same_align_drs_max = 0;
1412 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1413 hash_table<peel_info_hasher> peeling_htab (1);
1415 if (dump_enabled_p ())
1416 dump_printf_loc (MSG_NOTE, vect_location,
1417 "=== vect_enhance_data_refs_alignment ===\n");
1419 /* Reset data so we can safely be called multiple times. */
1420 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1421 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1423 /* While cost model enhancements are expected in the future, the high level
1424 view of the code at this time is as follows:
1426 A) If there is a misaligned access then see if peeling to align
1427 this access can make all data references satisfy
1428 vect_supportable_dr_alignment. If so, update data structures
1429 as needed and return true.
1431 B) If peeling wasn't possible and there is a data reference with an
1432 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1433 then see if loop versioning checks can be used to make all data
1434 references satisfy vect_supportable_dr_alignment. If so, update
1435 data structures as needed and return true.
1437 C) If neither peeling nor versioning were successful then return false if
1438 any data reference does not satisfy vect_supportable_dr_alignment.
1440 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1442 Note, Possibility 3 above (which is peeling and versioning together) is not
1443 being done at this time. */
1445 /* (1) Peeling to force alignment. */
1447 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1448 Considerations:
1449 + How many accesses will become aligned due to the peeling
1450 - How many accesses will become unaligned due to the peeling,
1451 and the cost of misaligned accesses.
1452 - The cost of peeling (the extra runtime checks, the increase
1453 in code size). */
1455 FOR_EACH_VEC_ELT (datarefs, i, dr)
1457 stmt = DR_STMT (dr);
1458 stmt_info = vinfo_for_stmt (stmt);
1460 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1461 continue;
1463 /* For interleaving, only the alignment of the first access
1464 matters. */
1465 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1466 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1467 continue;
1469 /* For invariant accesses there is nothing to enhance. */
1470 if (integer_zerop (DR_STEP (dr)))
1471 continue;
1473 /* Strided accesses perform only component accesses, alignment is
1474 irrelevant for them. */
1475 if (STMT_VINFO_STRIDED_P (stmt_info)
1476 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1477 continue;
1479 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1480 do_peeling = vector_alignment_reachable_p (dr);
1481 if (do_peeling)
1483 if (known_alignment_for_access_p (dr))
1485 unsigned int npeel_tmp;
1486 bool negative = tree_int_cst_compare (DR_STEP (dr),
1487 size_zero_node) < 0;
1489 /* Save info about DR in the hash table. */
1490 vectype = STMT_VINFO_VECTYPE (stmt_info);
1491 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1492 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1493 TREE_TYPE (DR_REF (dr))));
1494 npeel_tmp = (negative
1495 ? (mis - nelements) : (nelements - mis))
1496 & (nelements - 1);
1498 /* For multiple types, it is possible that the bigger type access
1499 will have more than one peeling option. E.g., a loop with two
1500 types: one of size (vector size / 4), and the other one of
1501 size (vector size / 8). The vectorization factor will be 8. If both
1502 accesses are misaligned by 3, the first one needs one scalar
1503 iteration to be aligned, and the second one needs 5. But the
1504 first one will be aligned also by peeling 5 scalar
1505 iterations, and in that case both accesses will be aligned.
1506 Hence, besides the immediate peeling amount, we also want to
1507 try adding a full vector size, as long as we don't exceed the
1508 vectorization factor.
1509 We do this automatically for the cost model, since we calculate
1510 the cost for every peeling option. */
1511 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1513 if (STMT_SLP_TYPE (stmt_info))
1514 possible_npeel_number
1515 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1516 else
1517 possible_npeel_number = vf / nelements;
1520 /* Handle the aligned case. We may decide to align some other
1521 access, making DR unaligned. */
1522 if (DR_MISALIGNMENT (dr) == 0)
1524 npeel_tmp = 0;
1525 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1526 possible_npeel_number++;
1529 for (j = 0; j < possible_npeel_number; j++)
1531 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1532 dr, npeel_tmp);
1533 npeel_tmp += nelements;
1536 all_misalignments_unknown = false;
1537 /* Data-ref that was chosen for the case that all the
1538 misalignments are unknown is not relevant anymore, since we
1539 have a data-ref with known alignment. */
1540 dr0 = NULL;
1542 else
1544 /* If we don't know any misalignment values, we prefer
1545 peeling for the data-ref that has the maximum number of data-refs
1546 with the same alignment, unless the target prefers to align
1547 stores over loads. */
1548 if (all_misalignments_unknown)
1550 unsigned same_align_drs
1551 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1552 if (!dr0
1553 || same_align_drs_max < same_align_drs)
1555 same_align_drs_max = same_align_drs;
1556 dr0 = dr;
1558 /* For data-refs with the same number of related
1559 accesses prefer the one where the misalign
1560 computation will be invariant in the outermost loop. */
1561 else if (same_align_drs_max == same_align_drs)
1563 struct loop *ivloop0, *ivloop;
1564 ivloop0 = outermost_invariant_loop_for_expr
1565 (loop, DR_BASE_ADDRESS (dr0));
1566 ivloop = outermost_invariant_loop_for_expr
1567 (loop, DR_BASE_ADDRESS (dr));
1568 if ((ivloop && !ivloop0)
1569 || (ivloop && ivloop0
1570 && flow_loop_nested_p (ivloop, ivloop0)))
1571 dr0 = dr;
1574 if (!first_store && DR_IS_WRITE (dr))
1575 first_store = dr;
1578 /* If there are both known and unknown misaligned accesses in the
1579 loop, we choose peeling amount according to the known
1580 accesses. */
1581 if (!supportable_dr_alignment)
1583 dr0 = dr;
1584 if (!first_store && DR_IS_WRITE (dr))
1585 first_store = dr;
1589 else
1591 if (!aligned_access_p (dr))
1593 if (dump_enabled_p ())
1594 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1595 "vector alignment may not be reachable\n");
1596 break;
1601 /* Check if we can possibly peel the loop. */
1602 if (!vect_can_advance_ivs_p (loop_vinfo)
1603 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1604 || loop->inner)
1605 do_peeling = false;
1607 if (do_peeling
1608 && all_misalignments_unknown
1609 && vect_supportable_dr_alignment (dr0, false))
1611 /* Check if the target prefers to align stores over loads, i.e., if
1612 misaligned stores are more expensive than misaligned loads (taking
1613 drs with same alignment into account). */
1614 if (first_store && DR_IS_READ (dr0))
1616 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1617 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1618 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1619 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1620 stmt_vector_for_cost dummy;
1621 dummy.create (2);
1623 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1624 &dummy);
1625 vect_get_data_access_cost (first_store, &store_inside_cost,
1626 &store_outside_cost, &dummy);
1628 dummy.release ();
1630 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1631 aligning the load DR0). */
1632 load_inside_penalty = store_inside_cost;
1633 load_outside_penalty = store_outside_cost;
1634 for (i = 0;
1635 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1636 DR_STMT (first_store))).iterate (i, &dr);
1637 i++)
1638 if (DR_IS_READ (dr))
1640 load_inside_penalty += load_inside_cost;
1641 load_outside_penalty += load_outside_cost;
1643 else
1645 load_inside_penalty += store_inside_cost;
1646 load_outside_penalty += store_outside_cost;
1649 /* Calculate the penalty for leaving DR0 unaligned (by
1650 aligning the FIRST_STORE). */
1651 store_inside_penalty = load_inside_cost;
1652 store_outside_penalty = load_outside_cost;
1653 for (i = 0;
1654 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1655 DR_STMT (dr0))).iterate (i, &dr);
1656 i++)
1657 if (DR_IS_READ (dr))
1659 store_inside_penalty += load_inside_cost;
1660 store_outside_penalty += load_outside_cost;
1662 else
1664 store_inside_penalty += store_inside_cost;
1665 store_outside_penalty += store_outside_cost;
1668 if (load_inside_penalty > store_inside_penalty
1669 || (load_inside_penalty == store_inside_penalty
1670 && load_outside_penalty > store_outside_penalty))
1671 dr0 = first_store;
1674 /* In case there are only loads with different unknown misalignments, use
1675 peeling only if it may help to align other accesses in the loop or
1676 if it may help improve load bandwidth when we'd otherwise end up using
1677 unaligned loads. */
1678 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1679 if (!first_store
1680 && !STMT_VINFO_SAME_ALIGN_REFS (
1681 vinfo_for_stmt (DR_STMT (dr0))).length ()
1682 && (vect_supportable_dr_alignment (dr0, false)
1683 != dr_unaligned_supported
1684 || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1685 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1686 do_peeling = false;
1689 if (do_peeling && !dr0)
1691 /* Peeling is possible, but no data access strictly requires alignment
1692 to be vectorizable. So we try to choose the best possible peeling. */
1694 /* We should get here only if there are drs with known misalignment. */
1695 gcc_assert (!all_misalignments_unknown);
1697 /* Choose the best peeling from the hash table. */
1698 dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
1699 loop_vinfo, &npeel,
1700 &body_cost_vec);
1701 if (!dr0 || !npeel)
1702 do_peeling = false;
1705 if (do_peeling)
1707 stmt = DR_STMT (dr0);
1708 stmt_info = vinfo_for_stmt (stmt);
1709 vectype = STMT_VINFO_VECTYPE (stmt_info);
1710 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1712 if (known_alignment_for_access_p (dr0))
1714 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1715 size_zero_node) < 0;
1716 if (!npeel)
1718 /* Since it's known at compile time, compute the number of
1719 iterations in the peeled loop (the peeling factor) for use in
1720 updating DR_MISALIGNMENT values. The peeling factor is the
1721 vectorization factor minus the misalignment as an element
1722 count. */
1723 mis = DR_MISALIGNMENT (dr0);
1724 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1725 npeel = ((negative ? mis - nelements : nelements - mis)
1726 & (nelements - 1));
1729 /* For interleaved data access every iteration accesses all the
1730 members of the group, therefore we divide the number of iterations
1731 by the group size. */
1732 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1733 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1734 npeel /= GROUP_SIZE (stmt_info);
1736 if (dump_enabled_p ())
1737 dump_printf_loc (MSG_NOTE, vect_location,
1738 "Try peeling by %d\n", npeel);
1741 /* Ensure that all data refs can be vectorized after the peel. */
1742 FOR_EACH_VEC_ELT (datarefs, i, dr)
1744 int save_misalignment;
1746 if (dr == dr0)
1747 continue;
1749 stmt = DR_STMT (dr);
1750 stmt_info = vinfo_for_stmt (stmt);
1751 /* For interleaving, only the alignment of the first access
1752 matters. */
1753 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1754 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1755 continue;
1757 /* Strided accesses perform only component accesses, alignment is
1758 irrelevant for them. */
1759 if (STMT_VINFO_STRIDED_P (stmt_info)
1760 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1761 continue;
1763 save_misalignment = DR_MISALIGNMENT (dr);
1764 vect_update_misalignment_for_peel (dr, dr0, npeel);
1765 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1766 SET_DR_MISALIGNMENT (dr, save_misalignment);
1768 if (!supportable_dr_alignment)
1770 do_peeling = false;
1771 break;
1775 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1777 stat = vect_verify_datarefs_alignment (loop_vinfo);
1778 if (!stat)
1779 do_peeling = false;
1780 else
1782 body_cost_vec.release ();
1783 return stat;
1787 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1788 if (do_peeling)
1790 unsigned max_allowed_peel
1791 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1792 if (max_allowed_peel != (unsigned)-1)
1794 unsigned max_peel = npeel;
1795 if (max_peel == 0)
1797 gimple *dr_stmt = DR_STMT (dr0);
1798 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1799 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1800 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1802 if (max_peel > max_allowed_peel)
1804 do_peeling = false;
1805 if (dump_enabled_p ())
1806 dump_printf_loc (MSG_NOTE, vect_location,
1807 "Disable peeling, max peels reached: %d\n", max_peel);
1812 /* Cost model #2 - if peeling may result in a remaining loop not
1813 iterating enough to be vectorized then do not peel. */
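  /* For instance (illustration only): with a vectorization factor of 4 and
     an npeel of 2, the loop must have at least 6 known iterations, otherwise
     no full vector iteration would remain after peeling.  */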
1814 if (do_peeling
1815 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1817 unsigned max_peel
1818 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1819 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1820 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1821 do_peeling = false;
1824 if (do_peeling)
1826 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1827 If the misalignment of DR_i is identical to that of dr0 then set
1828 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1829 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1830 by the peeling factor times the element size of DR_i (MOD the
1831 vectorization factor times the size). Otherwise, the
1832 misalignment of DR_i must be set to unknown. */
1833 FOR_EACH_VEC_ELT (datarefs, i, dr)
1834 if (dr != dr0)
1836 /* Strided accesses perform only component accesses, alignment
1837 is irrelevant for them. */
1838 stmt_info = vinfo_for_stmt (DR_STMT (dr));
1839 if (STMT_VINFO_STRIDED_P (stmt_info)
1840 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1841 continue;
1843 vect_update_misalignment_for_peel (dr, dr0, npeel);
1846 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1847 if (npeel)
1848 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1849 else
1850 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1851 = DR_MISALIGNMENT (dr0);
1852 SET_DR_MISALIGNMENT (dr0, 0);
1853 if (dump_enabled_p ())
1855 dump_printf_loc (MSG_NOTE, vect_location,
1856 "Alignment of access forced using peeling.\n");
1857 dump_printf_loc (MSG_NOTE, vect_location,
1858 "Peeling for alignment will be applied.\n");
1860 /* The inside-loop cost will be accounted for in vectorizable_load
1861 and vectorizable_store correctly with adjusted alignments.
1862 	 Drop the body_cost_vec on the floor here.  */
1863 body_cost_vec.release ();
1865 stat = vect_verify_datarefs_alignment (loop_vinfo);
1866 gcc_assert (stat);
1867 return stat;
1871 body_cost_vec.release ();
1873 /* (2) Versioning to force alignment. */
1875 /* Try versioning if:
1876 1) optimize loop for speed
1877 2) there is at least one unsupported misaligned data ref with an unknown
1878 misalignment, and
1879 3) all misaligned data refs with a known misalignment are supported, and
1880 4) the number of runtime alignment checks is within reason. */
1882 do_versioning =
1883 optimize_loop_nest_for_speed_p (loop)
1884 && (!loop->inner); /* FORNOW */
1886 if (do_versioning)
1888 FOR_EACH_VEC_ELT (datarefs, i, dr)
1890 stmt = DR_STMT (dr);
1891 stmt_info = vinfo_for_stmt (stmt);
1893 /* For interleaving, only the alignment of the first access
1894 matters. */
1895 if (aligned_access_p (dr)
1896 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1897 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1898 continue;
1900 if (STMT_VINFO_STRIDED_P (stmt_info))
1902 /* Strided loads perform only component accesses, alignment is
1903 irrelevant for them. */
1904 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1905 continue;
1906 do_versioning = false;
1907 break;
1910 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1912 if (!supportable_dr_alignment)
1914 gimple *stmt;
1915 int mask;
1916 tree vectype;
1918 if (known_alignment_for_access_p (dr)
1919 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1920 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1922 do_versioning = false;
1923 break;
1926 stmt = DR_STMT (dr);
1927 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1928 gcc_assert (vectype);
1930 /* The rightmost bits of an aligned address must be zeros.
1931 Construct the mask needed for this test. For example,
1932 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1933 mask must be 15 = 0xf. */
1934 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
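	      /* The versioned loop later checks that the tested addresses
		 have no bits set within MASK (roughly (addr & mask) == 0 for
		 each access); this note is an illustration, not part of the
		 original comments.  */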
1936 /* FORNOW: use the same mask to test all potentially unaligned
1937 references in the loop. The vectorizer currently supports
1938 a single vector size, see the reference to
1939 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1940 vectorization factor is computed. */
1941 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1942 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1943 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1944 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1945 DR_STMT (dr));
1949 /* Versioning requires at least one misaligned data reference. */
1950 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1951 do_versioning = false;
1952 else if (!do_versioning)
1953 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1956 if (do_versioning)
1958 vec<gimple *> may_misalign_stmts
1959 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1960 gimple *stmt;
1962 /* It can now be assumed that the data references in the statements
1963 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1964 of the loop being vectorized. */
1965 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1967 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1968 dr = STMT_VINFO_DATA_REF (stmt_info);
1969 SET_DR_MISALIGNMENT (dr, 0);
1970 if (dump_enabled_p ())
1971 dump_printf_loc (MSG_NOTE, vect_location,
1972 "Alignment of access forced using versioning.\n");
1975 if (dump_enabled_p ())
1976 dump_printf_loc (MSG_NOTE, vect_location,
1977 "Versioning for alignment will be applied.\n");
1979 /* Peeling and versioning can't be done together at this time. */
1980 gcc_assert (! (do_peeling && do_versioning));
1982 stat = vect_verify_datarefs_alignment (loop_vinfo);
1983 gcc_assert (stat);
1984 return stat;
1987 /* This point is reached if neither peeling nor versioning is being done. */
1988 gcc_assert (! (do_peeling || do_versioning));
1990 stat = vect_verify_datarefs_alignment (loop_vinfo);
1991 return stat;
1995 /* Function vect_find_same_alignment_drs.
1997 Update group and alignment relations according to the chosen
1998 vectorization factor. */
2000 static void
2001 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
2002 loop_vec_info loop_vinfo)
2004 unsigned int i;
2005 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2006 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2007 struct data_reference *dra = DDR_A (ddr);
2008 struct data_reference *drb = DDR_B (ddr);
2009 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2010 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2011 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
2012 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
2013 lambda_vector dist_v;
2014 unsigned int loop_depth;
2016 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2017 return;
2019 if (dra == drb)
2020 return;
2022 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
2023 return;
2025 /* Loop-based vectorization and known data dependence. */
2026 if (DDR_NUM_DIST_VECTS (ddr) == 0)
2027 return;
2029 /* Data-dependence analysis reports a distance vector of zero
2030 for data-references that overlap only in the first iteration
2031 but have different sign step (see PR45764).
2032 So as a sanity check require equal DR_STEP. */
2033 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2034 return;
2036 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
2037 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
2039 int dist = dist_v[loop_depth];
2041 if (dump_enabled_p ())
2042 dump_printf_loc (MSG_NOTE, vect_location,
2043 "dependence distance = %d.\n", dist);
2045 /* Same loop iteration. */
2046 if (dist == 0
2047 || (dist % vectorization_factor == 0 && dra_size == drb_size))
2049 /* Two references with distance zero have the same alignment. */
2050 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2051 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
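	  /* Illustration (not from the original comments): with a
	     vectorization factor of 4 and a dependence distance of 8
	     equal-sized elements, the two references are exactly two vectors
	     apart and therefore share the same misalignment.  */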
2052 if (dump_enabled_p ())
2054 dump_printf_loc (MSG_NOTE, vect_location,
2055 "accesses have the same alignment.\n");
2056 dump_printf (MSG_NOTE,
2057 "dependence distance modulo vf == 0 between ");
2058 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2059 dump_printf (MSG_NOTE, " and ");
2060 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2061 dump_printf (MSG_NOTE, "\n");
2068 /* Function vect_analyze_data_refs_alignment
2070 Analyze the alignment of the data-references in the loop.
2071 Return FALSE if a data reference is found that cannot be vectorized. */
2073 bool
2074 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2076 if (dump_enabled_p ())
2077 dump_printf_loc (MSG_NOTE, vect_location,
2078 "=== vect_analyze_data_refs_alignment ===\n");
2080   /* Mark groups of data references with the same alignment using
2081 data dependence information. */
2082 vec<ddr_p> ddrs = vinfo->ddrs;
2083 struct data_dependence_relation *ddr;
2084 unsigned int i;
2086 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2087 vect_find_same_alignment_drs (ddr, vinfo);
2089 vec<data_reference_p> datarefs = vinfo->datarefs;
2090 struct data_reference *dr;
2092 FOR_EACH_VEC_ELT (datarefs, i, dr)
2094 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2095 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2096 && !vect_compute_data_ref_alignment (dr))
2098 /* Strided accesses perform only component accesses, misalignment
2099 information is irrelevant for them. */
2100 if (STMT_VINFO_STRIDED_P (stmt_info)
2101 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2102 continue;
2104 if (dump_enabled_p ())
2105 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2106 "not vectorized: can't calculate alignment "
2107 "for data ref.\n");
2109 return false;
2113 return true;
2117 /* Analyze alignment of DRs of stmts in NODE. */
2119 static bool
2120 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2122 /* We vectorize from the first scalar stmt in the node unless
2123 the node is permuted in which case we start from the first
2124 element in the group. */
2125 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2126 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2127 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2128 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2130 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2131 if (! vect_compute_data_ref_alignment (dr)
2132 /* For creating the data-ref pointer we need alignment of the
2133 first element anyway. */
2134 || (dr != first_dr
2135 && ! vect_compute_data_ref_alignment (first_dr))
2136 || ! verify_data_ref_alignment (dr))
2138 if (dump_enabled_p ())
2139 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2140 "not vectorized: bad data alignment in basic "
2141 "block.\n");
2142 return false;
2145 return true;
2148 /* Function vect_slp_analyze_and_verify_instance_alignment
2150 Analyze the alignment of the data-references in the SLP instance.
2151 Return FALSE if a data reference is found that cannot be vectorized. */
2153 bool
2154 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2156 if (dump_enabled_p ())
2157 dump_printf_loc (MSG_NOTE, vect_location,
2158 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2160 slp_tree node;
2161 unsigned i;
2162 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2163 if (! vect_slp_analyze_and_verify_node_alignment (node))
2164 return false;
2166 node = SLP_INSTANCE_TREE (instance);
2167 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2168 && ! vect_slp_analyze_and_verify_node_alignment
2169 (SLP_INSTANCE_TREE (instance)))
2170 return false;
2172 return true;
2176 /* Analyze groups of accesses: check that DR belongs to a group of
2177 accesses of legal size, step, etc. Detect gaps, single element
2178 interleaving, and other special cases. Set grouped access info.
2179 Collect groups of strided stores for further use in SLP analysis.
2180 Worker for vect_analyze_group_access. */
2182 static bool
2183 vect_analyze_group_access_1 (struct data_reference *dr)
2185 tree step = DR_STEP (dr);
2186 tree scalar_type = TREE_TYPE (DR_REF (dr));
2187 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2188 gimple *stmt = DR_STMT (dr);
2189 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2190 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2191 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2192 HOST_WIDE_INT dr_step = -1;
2193 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2194 bool slp_impossible = false;
2196 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2197 size of the interleaving group (including gaps). */
2198 if (tree_fits_shwi_p (step))
2200 dr_step = tree_to_shwi (step);
2201 /* Check that STEP is a multiple of type size. Otherwise there is
2202 a non-element-sized gap at the end of the group which we
2203 cannot represent in GROUP_GAP or GROUP_SIZE.
2204 ??? As we can handle non-constant step fine here we should
2205 simply remove uses of GROUP_GAP between the last and first
2206 element and instead rely on DR_STEP. GROUP_SIZE then would
2207 simply not include that gap. */
2208 if ((dr_step % type_size) != 0)
2210 if (dump_enabled_p ())
2212 dump_printf_loc (MSG_NOTE, vect_location,
2213 "Step ");
2214 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2215 dump_printf (MSG_NOTE,
2216 " is not a multiple of the element size for ");
2217 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2218 dump_printf (MSG_NOTE, "\n");
2220 return false;
2222 groupsize = absu_hwi (dr_step) / type_size;
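      /* Illustration (not from the original comments): a step of 32 bytes
	 over 4-byte elements gives a groupsize of 8, i.e. each scalar
	 iteration strides over 8 element slots, whether or not all of them
	 are actually accessed.  */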
2224 else
2225 groupsize = 0;
2227   /* A non-consecutive access is possible only if it is part of an interleaving group.  */
2228 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2230       /* Check if this DR is part of an interleaving group, and is a single
2231 element of the group that is accessed in the loop. */
2233 /* Gaps are supported only for loads. STEP must be a multiple of the type
2234 size. The size of the group must be a power of 2. */
2235 if (DR_IS_READ (dr)
2236 && (dr_step % type_size) == 0
2237 && groupsize > 0
2238 && exact_log2 (groupsize) != -1)
2240 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2241 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2242 GROUP_GAP (stmt_info) = groupsize - 1;
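	  /* E.g. (illustration only) a loop that loads only a[4*i] from an
	     int array has a 16-byte step, so groupsize is 4 and GROUP_GAP
	     records 3 unused elements per group.  */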
2243 if (dump_enabled_p ())
2245 dump_printf_loc (MSG_NOTE, vect_location,
2246 "Detected single element interleaving ");
2247 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2248 dump_printf (MSG_NOTE, " step ");
2249 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2250 dump_printf (MSG_NOTE, "\n");
2253 return true;
2256 if (dump_enabled_p ())
2258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2259 "not consecutive access ");
2260 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2263 if (bb_vinfo)
2265 /* Mark the statement as unvectorizable. */
2266 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2267 return true;
2270 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2271 STMT_VINFO_STRIDED_P (stmt_info) = true;
2272 return true;
2275 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2277 /* First stmt in the interleaving chain. Check the chain. */
2278 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2279 struct data_reference *data_ref = dr;
2280 unsigned int count = 1;
2281 tree prev_init = DR_INIT (data_ref);
2282 gimple *prev = stmt;
2283 HOST_WIDE_INT diff, gaps = 0;
2285 while (next)
2287 	  /* Skip same data-refs. In case two or more stmts share a
2288 data-ref (supported only for loads), we vectorize only the first
2289 stmt, and the rest get their vectorized loads from the first
2290 one. */
2291 if (!tree_int_cst_compare (DR_INIT (data_ref),
2292 DR_INIT (STMT_VINFO_DATA_REF (
2293 vinfo_for_stmt (next)))))
2295 if (DR_IS_WRITE (data_ref))
2297 if (dump_enabled_p ())
2298 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2299 "Two store stmts share the same dr.\n");
2300 return false;
2303 if (dump_enabled_p ())
2304 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2305 "Two or more load stmts share the same dr.\n");
2307 /* For load use the same data-ref load. */
2308 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2310 prev = next;
2311 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2312 continue;
2315 prev = next;
2316 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2318 /* All group members have the same STEP by construction. */
2319 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2321 /* Check that the distance between two accesses is equal to the type
2322 size. Otherwise, we have gaps. */
2323 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2324 - TREE_INT_CST_LOW (prev_init)) / type_size;
2325 if (diff != 1)
2327 /* FORNOW: SLP of accesses with gaps is not supported. */
2328 slp_impossible = true;
2329 if (DR_IS_WRITE (data_ref))
2331 if (dump_enabled_p ())
2332 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2333 "interleaved store with gaps\n");
2334 return false;
2337 gaps += diff - 1;
2340 last_accessed_element += diff;
2342 /* Store the gap from the previous member of the group. If there is no
2343 gap in the access, GROUP_GAP is always 1. */
2344 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2346 prev_init = DR_INIT (data_ref);
2347 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2348 /* Count the number of data-refs in the chain. */
2349 count++;
2352 if (groupsize == 0)
2353 groupsize = count + gaps;
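	  /* Illustration: three group members with a single one-element gap
	     between two of them give count == 3 and gaps == 1, hence a
	     groupsize of 4.  */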
2355 if (groupsize > UINT_MAX)
2357 if (dump_enabled_p ())
2358 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2359 "group is too large\n");
2360 return false;
2363 /* Check that the size of the interleaving is equal to count for stores,
2364 i.e., that there are no gaps. */
2365 if (groupsize != count
2366 && !DR_IS_READ (dr))
2368 if (dump_enabled_p ())
2369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2370 "interleaved store with gaps\n");
2371 return false;
2374 /* If there is a gap after the last load in the group it is the
2375 difference between the groupsize and the last accessed
2376 element.
2377 When there is no gap, this difference should be 0. */
2378 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2380 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2381 if (dump_enabled_p ())
2383 dump_printf_loc (MSG_NOTE, vect_location,
2384 "Detected interleaving ");
2385 if (DR_IS_READ (dr))
2386 dump_printf (MSG_NOTE, "load ");
2387 else
2388 dump_printf (MSG_NOTE, "store ");
2389 dump_printf (MSG_NOTE, "of size %u starting with ",
2390 (unsigned)groupsize);
2391 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2392 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2393 dump_printf_loc (MSG_NOTE, vect_location,
2394 "There is a gap of %u elements after the group\n",
2395 GROUP_GAP (vinfo_for_stmt (stmt)));
2398 /* SLP: create an SLP data structure for every interleaving group of
2399      stores for further analysis in vect_analyze_slp.  */
2400 if (DR_IS_WRITE (dr) && !slp_impossible)
2402 if (loop_vinfo)
2403 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2404 if (bb_vinfo)
2405 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2409 return true;
2412 /* Analyze groups of accesses: check that DR belongs to a group of
2413 accesses of legal size, step, etc. Detect gaps, single element
2414 interleaving, and other special cases. Set grouped access info.
2415 Collect groups of strided stores for further use in SLP analysis. */
2417 static bool
2418 vect_analyze_group_access (struct data_reference *dr)
2420 if (!vect_analyze_group_access_1 (dr))
2422 /* Dissolve the group if present. */
2423 gimple *next;
2424 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2425 while (stmt)
2427 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2428 next = GROUP_NEXT_ELEMENT (vinfo);
2429 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2430 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2431 stmt = next;
2433 return false;
2435 return true;
2438 /* Analyze the access pattern of the data-reference DR.
2439 In case of non-consecutive accesses call vect_analyze_group_access() to
2440 analyze groups of accesses. */
2442 static bool
2443 vect_analyze_data_ref_access (struct data_reference *dr)
2445 tree step = DR_STEP (dr);
2446 tree scalar_type = TREE_TYPE (DR_REF (dr));
2447 gimple *stmt = DR_STMT (dr);
2448 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2449 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2450 struct loop *loop = NULL;
2452 if (loop_vinfo)
2453 loop = LOOP_VINFO_LOOP (loop_vinfo);
2455 if (loop_vinfo && !step)
2457 if (dump_enabled_p ())
2458 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2459 "bad data-ref access in loop\n");
2460 return false;
2463 /* Allow loads with zero step in inner-loop vectorization. */
2464 if (loop_vinfo && integer_zerop (step))
2466 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2467 if (!nested_in_vect_loop_p (loop, stmt))
2468 return DR_IS_READ (dr);
2469 /* Allow references with zero step for outer loops marked
2470 with pragma omp simd only - it guarantees absence of
2471 loop-carried dependencies between inner loop iterations. */
2472 if (!loop->force_vectorize)
2474 if (dump_enabled_p ())
2475 dump_printf_loc (MSG_NOTE, vect_location,
2476 "zero step in inner loop of nest\n");
2477 return false;
2481 if (loop && nested_in_vect_loop_p (loop, stmt))
2483 /* Interleaved accesses are not yet supported within outer-loop
2484 vectorization for references in the inner-loop. */
2485 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2487 /* For the rest of the analysis we use the outer-loop step. */
2488 step = STMT_VINFO_DR_STEP (stmt_info);
2489 if (integer_zerop (step))
2491 if (dump_enabled_p ())
2492 dump_printf_loc (MSG_NOTE, vect_location,
2493 "zero step in outer loop.\n");
2494 return DR_IS_READ (dr);
2498 /* Consecutive? */
2499 if (TREE_CODE (step) == INTEGER_CST)
2501 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2502 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2503 || (dr_step < 0
2504 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2506 /* Mark that it is not interleaving. */
2507 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2508 return true;
2512 if (loop && nested_in_vect_loop_p (loop, stmt))
2514 if (dump_enabled_p ())
2515 dump_printf_loc (MSG_NOTE, vect_location,
2516 "grouped access in outer loop.\n");
2517 return false;
2521 /* Assume this is a DR handled by non-constant strided load case. */
2522 if (TREE_CODE (step) != INTEGER_CST)
2523 return (STMT_VINFO_STRIDED_P (stmt_info)
2524 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2525 || vect_analyze_group_access (dr)));
2527   /* Not a consecutive access - check if it is part of an interleaving group.  */
2528 return vect_analyze_group_access (dr);
2533 /* A helper function used in the comparator function to sort data
2534    references. T1 and T2 are the two trees to be compared.
2535 The function returns -1, 0, or 1. */
2537 static int
2538 compare_tree (tree t1, tree t2)
2540 int i, cmp;
2541 enum tree_code code;
2542 char tclass;
2544 if (t1 == t2)
2545 return 0;
2546 if (t1 == NULL)
2547 return -1;
2548 if (t2 == NULL)
2549 return 1;
2551 STRIP_NOPS (t1);
2552 STRIP_NOPS (t2);
2554 if (TREE_CODE (t1) != TREE_CODE (t2))
2555 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2557 code = TREE_CODE (t1);
2558 switch (code)
2560 /* For const values, we can just use hash values for comparisons. */
2561 case INTEGER_CST:
2562 case REAL_CST:
2563 case FIXED_CST:
2564 case STRING_CST:
2565 case COMPLEX_CST:
2566 case VECTOR_CST:
2568 hashval_t h1 = iterative_hash_expr (t1, 0);
2569 hashval_t h2 = iterative_hash_expr (t2, 0);
2570 if (h1 != h2)
2571 return h1 < h2 ? -1 : 1;
2572 break;
2575 case SSA_NAME:
2576 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2577 if (cmp != 0)
2578 return cmp;
2580 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2581 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2582 break;
2584 default:
2585 tclass = TREE_CODE_CLASS (code);
2587 /* For var-decl, we could compare their UIDs. */
2588 if (tclass == tcc_declaration)
2590 if (DECL_UID (t1) != DECL_UID (t2))
2591 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2592 break;
2595 /* For expressions with operands, compare their operands recursively. */
2596 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2598 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2599 if (cmp != 0)
2600 return cmp;
2604 return 0;
2608 /* Compare two data-references DRA and DRB so that they can be sorted
2609    into chunks suitable for grouping.  */
2611 static int
2612 dr_group_sort_cmp (const void *dra_, const void *drb_)
2614 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2615 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2616 int cmp;
2618 /* Stabilize sort. */
2619 if (dra == drb)
2620 return 0;
2622 /* DRs in different loops never belong to the same group. */
2623 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2624 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2625 if (loopa != loopb)
2626 return loopa->num < loopb->num ? -1 : 1;
2628 /* Ordering of DRs according to base. */
2629 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2631 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2632 if (cmp != 0)
2633 return cmp;
2636 /* And according to DR_OFFSET. */
2637 if (!dr_equal_offsets_p (dra, drb))
2639 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2640 if (cmp != 0)
2641 return cmp;
2644 /* Put reads before writes. */
2645 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2646 return DR_IS_READ (dra) ? -1 : 1;
2648   /* Then sort by access size.  */
2649 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2650 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2652 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2653 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2654 if (cmp != 0)
2655 return cmp;
2658   /* And by step.  */
2659 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2661 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2662 if (cmp != 0)
2663 return cmp;
2666   /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID.  */
2667 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2668 if (cmp == 0)
2669 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2670 return cmp;
2673 /* Function vect_analyze_data_ref_accesses.
2675 Analyze the access pattern of all the data references in the loop.
2677 FORNOW: the only access pattern that is considered vectorizable is a
2678 simple step 1 (consecutive) access.
2680 FORNOW: handle only arrays and pointer accesses. */
2682 bool
2683 vect_analyze_data_ref_accesses (vec_info *vinfo)
2685 unsigned int i;
2686 vec<data_reference_p> datarefs = vinfo->datarefs;
2687 struct data_reference *dr;
2689 if (dump_enabled_p ())
2690 dump_printf_loc (MSG_NOTE, vect_location,
2691 "=== vect_analyze_data_ref_accesses ===\n");
2693 if (datarefs.is_empty ())
2694 return true;
2696 /* Sort the array of datarefs to make building the interleaving chains
2697      linear. Don't modify the original vector's order; it is needed for
2698 determining what dependencies are reversed. */
2699 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2700 datarefs_copy.qsort (dr_group_sort_cmp);
2702 /* Build the interleaving chains. */
2703 for (i = 0; i < datarefs_copy.length () - 1;)
2705 data_reference_p dra = datarefs_copy[i];
2706 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2707 stmt_vec_info lastinfo = NULL;
2708 for (i = i + 1; i < datarefs_copy.length (); ++i)
2710 data_reference_p drb = datarefs_copy[i];
2711 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2713 /* ??? Imperfect sorting (non-compatible types, non-modulo
2714 	     accesses, same accesses) can lead to a group being artificially
2715 	     split here, as we don't just skip over those. If it really
2716 	     matters we can push those to a worklist and re-iterate
2717 	     over them. Then we could just skip ahead to the next DR here.  */
2719 /* DRs in a different loop should not be put into the same
2720 interleaving group. */
2721 if (gimple_bb (DR_STMT (dra))->loop_father
2722 != gimple_bb (DR_STMT (drb))->loop_father)
2723 break;
2725 /* Check that the data-refs have same first location (except init)
2726 and they are both either store or load (not load and store,
2727 not masked loads or stores). */
2728 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2729 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2730 DR_BASE_ADDRESS (drb), 0)
2731 || !dr_equal_offsets_p (dra, drb)
2732 || !gimple_assign_single_p (DR_STMT (dra))
2733 || !gimple_assign_single_p (DR_STMT (drb)))
2734 break;
2736 /* Check that the data-refs have the same constant size. */
2737 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2738 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2739 if (!tree_fits_uhwi_p (sza)
2740 || !tree_fits_uhwi_p (szb)
2741 || !tree_int_cst_equal (sza, szb))
2742 break;
2744 /* Check that the data-refs have the same step. */
2745 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2746 break;
2748 /* Do not place the same access in the interleaving chain twice. */
2749 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2750 break;
2752 /* Check the types are compatible.
2753 ??? We don't distinguish this during sorting. */
2754 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2755 TREE_TYPE (DR_REF (drb))))
2756 break;
2758 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2759 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2760 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2761 gcc_assert (init_a <= init_b);
2763 /* If init_b == init_a + the size of the type * k, we have an
2764 interleaving, and DRA is accessed before DRB. */
2765 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2766 if (type_size_a == 0
2767 || (init_b - init_a) % type_size_a != 0)
2768 break;
2770 /* If we have a store, the accesses are adjacent. This splits
2771 groups into chunks we support (we don't support vectorization
2772 of stores with gaps). */
2773 if (!DR_IS_READ (dra)
2774 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2775 (DR_INIT (datarefs_copy[i-1]))
2776 != type_size_a))
2777 break;
2779 /* If the step (if not zero or non-constant) is greater than the
2780 difference between data-refs' inits this splits groups into
2781 suitable sizes. */
2782 if (tree_fits_shwi_p (DR_STEP (dra)))
2784 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2785 if (step != 0 && step <= (init_b - init_a))
2786 break;
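	      /* Illustration (not from the original comments): with a
		 16-byte step and inits of 0 and 16, the second ref touches
		 the same addresses as the first one, just one iteration
		 later, so it must start a new group.  */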
2789 if (dump_enabled_p ())
2791 dump_printf_loc (MSG_NOTE, vect_location,
2792 "Detected interleaving ");
2793 if (DR_IS_READ (dra))
2794 dump_printf (MSG_NOTE, "load ");
2795 else
2796 dump_printf (MSG_NOTE, "store ");
2797 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2798 dump_printf (MSG_NOTE, " and ");
2799 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2800 dump_printf (MSG_NOTE, "\n");
2803 /* Link the found element into the group list. */
2804 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2806 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2807 lastinfo = stmtinfo_a;
2809 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2810 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2811 lastinfo = stmtinfo_b;
2815 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2816 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2817 && !vect_analyze_data_ref_access (dr))
2819 if (dump_enabled_p ())
2820 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2821 "not vectorized: complicated access pattern.\n");
2823 if (is_a <bb_vec_info> (vinfo))
2825 /* Mark the statement as not vectorizable. */
2826 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2827 continue;
2829 else
2831 datarefs_copy.release ();
2832 return false;
2836 datarefs_copy.release ();
2837 return true;
2841 /* Operator == between two dr_with_seg_len objects.
2843 This equality operator is used to make sure two data refs
2844    are the same, so that we will consider combining the
2845 aliasing checks of those two pairs of data dependent data
2846 refs. */
2848 static bool
2849 operator == (const dr_with_seg_len& d1,
2850 const dr_with_seg_len& d2)
2852 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2853 DR_BASE_ADDRESS (d2.dr), 0)
2854 && compare_tree (DR_OFFSET (d1.dr), DR_OFFSET (d2.dr)) == 0
2855 && compare_tree (DR_INIT (d1.dr), DR_INIT (d2.dr)) == 0
2856 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2859 /* Function comp_dr_with_seg_len_pair.
2861 Comparison function for sorting objects of dr_with_seg_len_pair_t
2862 so that we can combine aliasing checks in one scan. */
2864 static int
2865 comp_dr_with_seg_len_pair (const void *pa_, const void *pb_)
2867 const dr_with_seg_len_pair_t* pa = (const dr_with_seg_len_pair_t *) pa_;
2868 const dr_with_seg_len_pair_t* pb = (const dr_with_seg_len_pair_t *) pb_;
2869 const dr_with_seg_len &a1 = pa->first, &a2 = pa->second;
2870 const dr_with_seg_len &b1 = pb->first, &b2 = pb->second;
2872   /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2873      if a and c have the same base address and step, and b and d have the same
2874      address and step. Therefore, if either a&c or b&d don't have the same address
2875      and step, we don't care about the order of those two pairs after sorting.
2876 int comp_res;
2878 if ((comp_res = compare_tree (DR_BASE_ADDRESS (a1.dr),
2879 DR_BASE_ADDRESS (b1.dr))) != 0)
2880 return comp_res;
2881 if ((comp_res = compare_tree (DR_BASE_ADDRESS (a2.dr),
2882 DR_BASE_ADDRESS (b2.dr))) != 0)
2883 return comp_res;
2884 if ((comp_res = compare_tree (DR_STEP (a1.dr), DR_STEP (b1.dr))) != 0)
2885 return comp_res;
2886 if ((comp_res = compare_tree (DR_STEP (a2.dr), DR_STEP (b2.dr))) != 0)
2887 return comp_res;
2888 if ((comp_res = compare_tree (DR_OFFSET (a1.dr), DR_OFFSET (b1.dr))) != 0)
2889 return comp_res;
2890 if ((comp_res = compare_tree (DR_INIT (a1.dr), DR_INIT (b1.dr))) != 0)
2891 return comp_res;
2892 if ((comp_res = compare_tree (DR_OFFSET (a2.dr), DR_OFFSET (b2.dr))) != 0)
2893 return comp_res;
2894 if ((comp_res = compare_tree (DR_INIT (a2.dr), DR_INIT (b2.dr))) != 0)
2895 return comp_res;
2897 return 0;
2900 /* Function vect_vfa_segment_size.
2902    Create an expression that computes the size of the segment
2903    that will be accessed for a data reference. The function takes into
2904    account that realignment loads may access one more vector.
2906 Input:
2907 DR: The data reference.
2908 LENGTH_FACTOR: segment length to consider.
2910    Return an expression whose value is the size of the segment which will be
2911 accessed by DR. */
2913 static tree
2914 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2916 tree segment_length;
2918 if (integer_zerop (DR_STEP (dr)))
2919 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2920 else
2921 segment_length = size_binop (MULT_EXPR,
2922 fold_convert (sizetype, DR_STEP (dr)),
2923 fold_convert (sizetype, length_factor));
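  /* For example (illustration only): a data-ref with a 4-byte DR_STEP and a
     length factor equal to a vectorization factor of 4 gets a segment length
     of 16 bytes.  */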
2925 if (vect_supportable_dr_alignment (dr, false)
2926 == dr_explicit_realign_optimized)
2928 tree vector_size = TYPE_SIZE_UNIT
2929 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2931 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2933 return segment_length;
2936 /* Function vect_prune_runtime_alias_test_list.
2938 Prune a list of ddrs to be tested at run-time by versioning for alias.
2939 Merge several alias checks into one if possible.
2940    Return FALSE if the resulting list of ddrs is longer than allowed by
2941 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2943 bool
2944 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2946 vec<ddr_p> may_alias_ddrs =
2947 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2948 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2949 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2950 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2951 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2953 ddr_p ddr;
2954 unsigned int i;
2955 tree length_factor;
2957 if (dump_enabled_p ())
2958 dump_printf_loc (MSG_NOTE, vect_location,
2959 "=== vect_prune_runtime_alias_test_list ===\n");
2961 if (may_alias_ddrs.is_empty ())
2962 return true;
2964 /* Basically, for each pair of dependent data refs store_ptr_0
2965 and load_ptr_0, we create an expression:
2967 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2968      || ((load_ptr_0 + load_segment_length_0) <= store_ptr_0)
2970 for aliasing checks. However, in some cases we can decrease
2971 the number of checks by combining two checks into one. For
2972 example, suppose we have another pair of data refs store_ptr_0
2973 and load_ptr_1, and if the following condition is satisfied:
2975 load_ptr_0 < load_ptr_1 &&
2976 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2978 (this condition means, in each iteration of vectorized loop,
2979 the accessed memory of store_ptr_0 cannot be between the memory
2980 of load_ptr_0 and load_ptr_1.)
2982      we can then use only the following expression to finish the
2983      aliasing checks between store_ptr_0 & load_ptr_0 and
2984 store_ptr_0 & load_ptr_1:
2986 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2987      || ((load_ptr_1 + load_segment_length_1) <= store_ptr_0)
2989 Note that we only consider that load_ptr_0 and load_ptr_1 have the
2990      same base address.  */
2992 comp_alias_ddrs.create (may_alias_ddrs.length ());
2994 /* First, we collect all data ref pairs for aliasing checks. */
2995 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2997 int comp_res;
2998 struct data_reference *dr_a, *dr_b;
2999 gimple *dr_group_first_a, *dr_group_first_b;
3000 tree segment_length_a, segment_length_b;
3001 gimple *stmt_a, *stmt_b;
3003 dr_a = DDR_A (ddr);
3004 stmt_a = DR_STMT (DDR_A (ddr));
3005 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3006 if (dr_group_first_a)
3008 stmt_a = dr_group_first_a;
3009 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3012 dr_b = DDR_B (ddr);
3013 stmt_b = DR_STMT (DDR_B (ddr));
3014 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3015 if (dr_group_first_b)
3017 stmt_b = dr_group_first_b;
3018 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3021 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3022 length_factor = scalar_loop_iters;
3023 else
3024 length_factor = size_int (vect_factor);
3025 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3026 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3028 dr_with_seg_len_pair_t dr_with_seg_len_pair
3029 (dr_with_seg_len (dr_a, segment_length_a),
3030 dr_with_seg_len (dr_b, segment_length_b));
3032 /* Canonicalize pairs by sorting the two DR members. */
3033 comp_res = compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b));
3034 if (comp_res > 0
3035 || (comp_res == 0
3036 && compare_tree (DR_OFFSET (dr_a), DR_OFFSET (dr_b)) > 0))
3037 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3039 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3042 /* Second, we sort the collected data ref pairs so that we can scan
3043 them once to combine all possible aliasing checks. */
3044 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
3046 /* Third, we scan the sorted dr pairs and check if we can combine
3047 alias checks of two neighboring dr pairs. */
3048 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
3050 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
3051 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
3052 *dr_b1 = &comp_alias_ddrs[i-1].second,
3053 *dr_a2 = &comp_alias_ddrs[i].first,
3054 *dr_b2 = &comp_alias_ddrs[i].second;
3056 /* Remove duplicate data ref pairs. */
3057 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
3059 if (dump_enabled_p ())
3061 dump_printf_loc (MSG_NOTE, vect_location,
3062 "found equal ranges ");
3063 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3064 DR_REF (dr_a1->dr));
3065 dump_printf (MSG_NOTE, ", ");
3066 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3067 DR_REF (dr_b1->dr));
3068 dump_printf (MSG_NOTE, " and ");
3069 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3070 DR_REF (dr_a2->dr));
3071 dump_printf (MSG_NOTE, ", ");
3072 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3073 DR_REF (dr_b2->dr));
3074 dump_printf (MSG_NOTE, "\n");
3077 comp_alias_ddrs.ordered_remove (i--);
3078 continue;
3081 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
3083 	  /* We consider the case that DR_B1 and DR_B2 are the same memrefs,
3084 and DR_A1 and DR_A2 are two consecutive memrefs. */
3085 if (*dr_a1 == *dr_a2)
3087 std::swap (dr_a1, dr_b1);
3088 std::swap (dr_a2, dr_b2);
3091 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
3092 DR_BASE_ADDRESS (dr_a2->dr), 0)
3093 || !operand_equal_p (DR_OFFSET (dr_a1->dr),
3094 DR_OFFSET (dr_a2->dr), 0)
3095 || !tree_fits_shwi_p (DR_INIT (dr_a1->dr))
3096 || !tree_fits_shwi_p (DR_INIT (dr_a2->dr)))
3097 continue;
3099 /* Make sure dr_a1 starts left of dr_a2. */
3100 if (tree_int_cst_lt (DR_INIT (dr_a2->dr), DR_INIT (dr_a1->dr)))
3101 std::swap (*dr_a1, *dr_a2);
3103 bool do_remove = false;
3104 unsigned HOST_WIDE_INT diff
3105 = (tree_to_shwi (DR_INIT (dr_a2->dr))
3106 - tree_to_shwi (DR_INIT (dr_a1->dr)));
3108 /* If the left segment does not extend beyond the start of the
3109 right segment the new segment length is that of the right
3110 plus the segment distance. */
3111 if (tree_fits_uhwi_p (dr_a1->seg_len)
3112 && compare_tree_int (dr_a1->seg_len, diff) <= 0)
3114 dr_a1->seg_len = size_binop (PLUS_EXPR, dr_a2->seg_len,
3115 size_int (diff));
3116 do_remove = true;
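	  /* Illustration of the case just handled: if DR_A1 covers bytes
	     [0, 16) of its base and DR_A2 starts 16 bytes later with the
	     same 16-byte segment, the merged check covers [0, 32) relative
	     to DR_A1.  */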
3118 /* Generally the new segment length is the maximum of the
3119 left segment size and the right segment size plus the distance.
3120 ??? We can also build tree MAX_EXPR here but it's not clear this
3121 is profitable. */
3122 else if (tree_fits_uhwi_p (dr_a1->seg_len)
3123 && tree_fits_uhwi_p (dr_a2->seg_len))
3125 unsigned HOST_WIDE_INT seg_len_a1 = tree_to_uhwi (dr_a1->seg_len);
3126 unsigned HOST_WIDE_INT seg_len_a2 = tree_to_uhwi (dr_a2->seg_len);
3127 dr_a1->seg_len = size_int (MAX (seg_len_a1, diff + seg_len_a2));
3128 do_remove = true;
3130 /* Now we check if the following condition is satisfied:
3132 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
3134 where DIFF = DR_A2_INIT - DR_A1_INIT. However,
3135 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
3136 have to make a best estimation. We can get the minimum value
3137 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
3138 then either of the following two conditions can guarantee the
3139 one above:
3141 1: DIFF <= MIN_SEG_LEN_B
3142 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B */
3143 else
3145 unsigned HOST_WIDE_INT min_seg_len_b
3146 = (tree_fits_uhwi_p (dr_b1->seg_len)
3147 ? tree_to_uhwi (dr_b1->seg_len)
3148 : vect_factor);
3150 if (diff <= min_seg_len_b
3151 || (tree_fits_uhwi_p (dr_a1->seg_len)
3152 && diff - tree_to_uhwi (dr_a1->seg_len) < min_seg_len_b))
3154 dr_a1->seg_len = size_binop (PLUS_EXPR,
3155 dr_a2->seg_len, size_int (diff));
3156 do_remove = true;
3160 if (do_remove)
3162 if (dump_enabled_p ())
3164 dump_printf_loc (MSG_NOTE, vect_location,
3165 "merging ranges for ");
3166 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1->dr));
3167 dump_printf (MSG_NOTE, ", ");
3168 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1->dr));
3169 dump_printf (MSG_NOTE, " and ");
3170 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2->dr));
3171 dump_printf (MSG_NOTE, ", ");
3172 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2->dr));
3173 dump_printf (MSG_NOTE, "\n");
3175 comp_alias_ddrs.ordered_remove (i--);
3180 dump_printf_loc (MSG_NOTE, vect_location,
3181 "improved number of alias checks from %d to %d\n",
3182 may_alias_ddrs.length (), comp_alias_ddrs.length ());
3183 if ((int) comp_alias_ddrs.length () >
3184 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3185 return false;
3187 return true;
3190 /* Check whether a non-affine read or write in stmt is suitable for gather load
3191 or scatter store and if so, return a builtin decl for that operation. */
3193 tree
3194 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo, tree *basep,
3195 tree *offp, int *scalep)
3197 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3198 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3199 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3200 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3201 tree offtype = NULL_TREE;
3202 tree decl, base, off;
3203 machine_mode pmode;
3204 int punsignedp, reversep, pvolatilep = 0;
3206 base = DR_REF (dr);
3207 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3208 see if we can use the def stmt of the address. */
3209 if (is_gimple_call (stmt)
3210 && gimple_call_internal_p (stmt)
3211 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3212 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3213 && TREE_CODE (base) == MEM_REF
3214 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3215 && integer_zerop (TREE_OPERAND (base, 1))
3216 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3218 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3219 if (is_gimple_assign (def_stmt)
3220 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3221 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3224 /* The gather and scatter builtins need address of the form
3225      loop_invariant + vector * {1, 2, 4, 8}
3226    or
3227      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3228 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3229 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3230 multiplications and additions in it. To get a vector, we need
3231 a single SSA_NAME that will be defined in the loop and will
3232 contain everything that is not loop invariant and that can be
3233      vectorized. The following code attempts to find such a preexisting
3234 SSA_NAME OFF and put the loop invariants into a tree BASE
3235 that can be gimplified before the loop. */
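  /* Illustrative case (an assumption, not part of the original comment):
     for a read such as p[idx[i]], the loop-invariant address of p ends up in
     BASE, the SSA_NAME carrying idx[i] (possibly sign-extended) becomes OFF,
     and SCALE is the element size peeled off a MULT_EXPR below.  */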
3236 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3237 &punsignedp, &reversep, &pvolatilep, false);
3238 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3240 if (TREE_CODE (base) == MEM_REF)
3242 if (!integer_zerop (TREE_OPERAND (base, 1)))
3244 if (off == NULL_TREE)
3246 offset_int moff = mem_ref_offset (base);
3247 off = wide_int_to_tree (sizetype, moff);
3249 else
3250 off = size_binop (PLUS_EXPR, off,
3251 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3253 base = TREE_OPERAND (base, 0);
3255 else
3256 base = build_fold_addr_expr (base);
3258 if (off == NULL_TREE)
3259 off = size_zero_node;
3261   /* If base is not loop invariant, then if off is 0 we start with just
3262      the constant offset in the loop invariant BASE and continue with base
3263      as OFF; otherwise give up.
3264 We could handle that case by gimplifying the addition of base + off
3265 into some SSA_NAME and use that as off, but for now punt. */
3266 if (!expr_invariant_in_loop_p (loop, base))
3268 if (!integer_zerop (off))
3269 return NULL_TREE;
3270 off = base;
3271 base = size_int (pbitpos / BITS_PER_UNIT);
3273 /* Otherwise put base + constant offset into the loop invariant BASE
3274 and continue with OFF. */
3275 else
3277 base = fold_convert (sizetype, base);
3278 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3281 /* OFF at this point may be either a SSA_NAME or some tree expression
3282 from get_inner_reference. Try to peel off loop invariants from it
3283 into BASE as long as possible. */
3284 STRIP_NOPS (off);
3285 while (offtype == NULL_TREE)
3287 enum tree_code code;
3288 tree op0, op1, add = NULL_TREE;
3290 if (TREE_CODE (off) == SSA_NAME)
3292 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3294 if (expr_invariant_in_loop_p (loop, off))
3295 return NULL_TREE;
3297 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3298 break;
3300 op0 = gimple_assign_rhs1 (def_stmt);
3301 code = gimple_assign_rhs_code (def_stmt);
3302 op1 = gimple_assign_rhs2 (def_stmt);
3304 else
3306 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3307 return NULL_TREE;
3308 code = TREE_CODE (off);
3309 extract_ops_from_tree (off, &code, &op0, &op1);
3311 switch (code)
3313 case POINTER_PLUS_EXPR:
3314 case PLUS_EXPR:
3315 if (expr_invariant_in_loop_p (loop, op0))
3317 add = op0;
3318 off = op1;
3319 do_add:
3320 add = fold_convert (sizetype, add);
3321 if (scale != 1)
3322 add = size_binop (MULT_EXPR, add, size_int (scale));
3323 base = size_binop (PLUS_EXPR, base, add);
3324 continue;
3326 if (expr_invariant_in_loop_p (loop, op1))
3328 add = op1;
3329 off = op0;
3330 goto do_add;
3332 break;
3333 case MINUS_EXPR:
3334 if (expr_invariant_in_loop_p (loop, op1))
3336 add = fold_convert (sizetype, op1);
3337 add = size_binop (MINUS_EXPR, size_zero_node, add);
3338 off = op0;
3339 goto do_add;
3341 break;
3342 case MULT_EXPR:
3343 if (scale == 1 && tree_fits_shwi_p (op1))
3345 scale = tree_to_shwi (op1);
3346 off = op0;
3347 continue;
3349 break;
3350 case SSA_NAME:
3351 off = op0;
3352 continue;
3353 CASE_CONVERT:
3354 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3355 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3356 break;
3357 if (TYPE_PRECISION (TREE_TYPE (op0))
3358 == TYPE_PRECISION (TREE_TYPE (off)))
3360 off = op0;
3361 continue;
3363 if (TYPE_PRECISION (TREE_TYPE (op0))
3364 < TYPE_PRECISION (TREE_TYPE (off)))
3366 off = op0;
3367 offtype = TREE_TYPE (off);
3368 STRIP_NOPS (off);
3369 continue;
3371 break;
3372 default:
3373 break;
3375 break;
3378   /* If at the end OFF still isn't an SSA_NAME or isn't
3379 defined in the loop, punt. */
3380 if (TREE_CODE (off) != SSA_NAME
3381 || expr_invariant_in_loop_p (loop, off))
3382 return NULL_TREE;
3384 if (offtype == NULL_TREE)
3385 offtype = TREE_TYPE (off);
3387 if (DR_IS_READ (dr))
3388 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3389 offtype, scale);
3390 else
3391 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3392 offtype, scale);
3394 if (decl == NULL_TREE)
3395 return NULL_TREE;
3397 if (basep)
3398 *basep = base;
3399 if (offp)
3400 *offp = off;
3401 if (scalep)
3402 *scalep = scale;
3403 return decl;
3406 /* Function vect_analyze_data_refs.
3408 Find all the data references in the loop or basic block.
3410 The general structure of the analysis of data refs in the vectorizer is as
3411 follows:
3412 1- vect_analyze_data_refs(loop/bb): call
3413 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3414 in the loop/bb and their dependences.
3415 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3416 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3417 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3421 bool
3422 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3424 struct loop *loop = NULL;
3425 unsigned int i;
3426 struct data_reference *dr;
3427 tree scalar_type;
3429 if (dump_enabled_p ())
3430 dump_printf_loc (MSG_NOTE, vect_location,
3431 "=== vect_analyze_data_refs ===\n");
3433 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3434 loop = LOOP_VINFO_LOOP (loop_vinfo);
3436 /* Go through the data-refs, check that the analysis succeeded. Update
3437 pointer from stmt_vec_info struct to DR and vectype. */
3439 vec<data_reference_p> datarefs = vinfo->datarefs;
3440 FOR_EACH_VEC_ELT (datarefs, i, dr)
3442 gimple *stmt;
3443 stmt_vec_info stmt_info;
3444 tree base, offset, init;
3445 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3446 bool simd_lane_access = false;
3447 int vf;
3449 again:
3450 if (!dr || !DR_REF (dr))
3452 if (dump_enabled_p ())
3453 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3454 "not vectorized: unhandled data-ref\n");
3455 return false;
3458 stmt = DR_STMT (dr);
3459 stmt_info = vinfo_for_stmt (stmt);
3461 /* Discard clobbers from the dataref vector. We will remove
3462 clobber stmts during vectorization. */
3463 if (gimple_clobber_p (stmt))
3465 free_data_ref (dr);
3466 if (i == datarefs.length () - 1)
3468 datarefs.pop ();
3469 break;
3471 datarefs.ordered_remove (i);
3472 dr = datarefs[i];
3473 goto again;
3476 /* Check that analysis of the data-ref succeeded. */
3477 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3478 || !DR_STEP (dr))
3480 bool maybe_gather
3481 = DR_IS_READ (dr)
3482 && !TREE_THIS_VOLATILE (DR_REF (dr))
3483 && targetm.vectorize.builtin_gather != NULL;
3484 bool maybe_scatter
3485 = DR_IS_WRITE (dr)
3486 && !TREE_THIS_VOLATILE (DR_REF (dr))
3487 && targetm.vectorize.builtin_scatter != NULL;
3488 bool maybe_simd_lane_access
3489 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3491 /* If target supports vector gather loads or scatter stores, or if
3492 	 this might be a SIMD lane access, see if they can be used.  */
3493 if (is_a <loop_vec_info> (vinfo)
3494 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3495 && !nested_in_vect_loop_p (loop, stmt))
3497 struct data_reference *newdr
3498 = create_data_ref (NULL, loop_containing_stmt (stmt),
3499 DR_REF (dr), stmt, maybe_scatter ? false : true);
3500 gcc_assert (newdr != NULL && DR_REF (newdr));
3501 if (DR_BASE_ADDRESS (newdr)
3502 && DR_OFFSET (newdr)
3503 && DR_INIT (newdr)
3504 && DR_STEP (newdr)
3505 && integer_zerop (DR_STEP (newdr)))
3507 if (maybe_simd_lane_access)
3509 tree off = DR_OFFSET (newdr);
3510 STRIP_NOPS (off);
3511 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3512 && TREE_CODE (off) == MULT_EXPR
3513 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3515 tree step = TREE_OPERAND (off, 1);
3516 off = TREE_OPERAND (off, 0);
3517 STRIP_NOPS (off);
3518 if (CONVERT_EXPR_P (off)
3519 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3520 0)))
3521 < TYPE_PRECISION (TREE_TYPE (off)))
3522 off = TREE_OPERAND (off, 0);
3523 if (TREE_CODE (off) == SSA_NAME)
3525 gimple *def = SSA_NAME_DEF_STMT (off);
3526 tree reft = TREE_TYPE (DR_REF (newdr));
3527 if (is_gimple_call (def)
3528 && gimple_call_internal_p (def)
3529 && (gimple_call_internal_fn (def)
3530 == IFN_GOMP_SIMD_LANE))
3532 tree arg = gimple_call_arg (def, 0);
3533 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3534 arg = SSA_NAME_VAR (arg);
3535 if (arg == loop->simduid
3536 /* For now. */
3537 && tree_int_cst_equal
3538 (TYPE_SIZE_UNIT (reft),
3539 step))
3541 DR_OFFSET (newdr) = ssize_int (0);
3542 DR_STEP (newdr) = step;
3543 DR_ALIGNED_TO (newdr)
3544 = size_int (BIGGEST_ALIGNMENT);
3545 dr = newdr;
3546 simd_lane_access = true;
3552 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3554 dr = newdr;
3555 if (maybe_gather)
3556 gatherscatter = GATHER;
3557 else
3558 gatherscatter = SCATTER;
3561 if (gatherscatter == SG_NONE && !simd_lane_access)
3562 free_data_ref (newdr);
3565 if (gatherscatter == SG_NONE && !simd_lane_access)
3567 if (dump_enabled_p ())
3569 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3570 "not vectorized: data ref analysis "
3571 "failed ");
3572 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3575 if (is_a <bb_vec_info> (vinfo))
3576 break;
3578 return false;
3582 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3584 if (dump_enabled_p ())
3585 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3586 "not vectorized: base addr of dr is a "
3587 "constant\n");
3589 if (is_a <bb_vec_info> (vinfo))
3590 break;
3592 if (gatherscatter != SG_NONE || simd_lane_access)
3593 free_data_ref (dr);
3594 return false;
3597 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3599 if (dump_enabled_p ())
3601 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3602 "not vectorized: volatile type ");
3603 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3606 if (is_a <bb_vec_info> (vinfo))
3607 break;
3609 return false;
3612 if (stmt_can_throw_internal (stmt))
3614 if (dump_enabled_p ())
3616 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3617 "not vectorized: statement can throw an "
3618 "exception ");
3619 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3622 if (is_a <bb_vec_info> (vinfo))
3623 break;
3625 if (gatherscatter != SG_NONE || simd_lane_access)
3626 free_data_ref (dr);
3627 return false;
3630 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3631 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3633 if (dump_enabled_p ())
3635 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3636 "not vectorized: statement is bitfield "
3637 "access ");
3638 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3641 if (is_a <bb_vec_info> (vinfo))
3642 break;
3644 if (gatherscatter != SG_NONE || simd_lane_access)
3645 free_data_ref (dr);
3646 return false;
3649 base = unshare_expr (DR_BASE_ADDRESS (dr));
3650 offset = unshare_expr (DR_OFFSET (dr));
3651 init = unshare_expr (DR_INIT (dr));
3653 if (is_gimple_call (stmt)
3654 && (!gimple_call_internal_p (stmt)
3655 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3656 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3658 if (dump_enabled_p ())
3660 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3661 "not vectorized: dr in a call ");
3662 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3665 if (is_a <bb_vec_info> (vinfo))
3666 break;
3668 if (gatherscatter != SG_NONE || simd_lane_access)
3669 free_data_ref (dr);
3670 return false;
3673 /* Update DR field in stmt_vec_info struct. */
 3675   /* If the dataref is in an inner-loop of the loop that is considered
3676 for vectorization, we also want to analyze the access relative to
3677 the outer-loop (DR contains information only relative to the
3678 inner-most enclosing loop). We do that by building a reference to the
3679 first location accessed by the inner-loop, and analyze it relative to
3680 the outer-loop. */
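      /* Illustrative sketch only (mirroring the in[i+j] example used in the
	 comments further below, with 'in' pointing to short):

	     for (i = 0; i < N; i++)       <-- outer-loop being vectorized
	       for (j = 0; j < M; j++)     <-- inner-loop containing the dataref
		 s += in[i+j];

	 Relative to the inner j-loop the access has a step of 2 bytes; the
	 analysis below re-analyzes the first location the inner-loop touches,
	 *(in + i*2), relative to the outer i-loop, which also yields an outer
	 step of 2 bytes, since that first location advances by one short per
	 outer iteration.  */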
3681 if (loop && nested_in_vect_loop_p (loop, stmt))
3683 tree outer_step, outer_base, outer_init;
3684 HOST_WIDE_INT pbitsize, pbitpos;
3685 tree poffset;
3686 machine_mode pmode;
3687 int punsignedp, preversep, pvolatilep;
3688 affine_iv base_iv, offset_iv;
3689 tree dinit;
3691 /* Build a reference to the first location accessed by the
3692 inner-loop: *(BASE+INIT). (The first location is actually
3693 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3694 tree inner_base = build_fold_indirect_ref
3695 (fold_build_pointer_plus (base, init));
3697 if (dump_enabled_p ())
3699 dump_printf_loc (MSG_NOTE, vect_location,
3700 "analyze in outer-loop: ");
3701 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3702 dump_printf (MSG_NOTE, "\n");
3705 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3706 &poffset, &pmode, &punsignedp,
3707 &preversep, &pvolatilep, false);
3708 gcc_assert (outer_base != NULL_TREE);
3710 if (pbitpos % BITS_PER_UNIT != 0)
3712 if (dump_enabled_p ())
3713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3714 "failed: bit offset alignment.\n");
3715 return false;
3718 if (preversep)
3720 if (dump_enabled_p ())
3721 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3722 "failed: reverse storage order.\n");
3723 return false;
3726 outer_base = build_fold_addr_expr (outer_base);
3727 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3728 &base_iv, false))
3730 if (dump_enabled_p ())
3731 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3732 "failed: evolution of base is not affine.\n");
3733 return false;
3736 if (offset)
3738 if (poffset)
3739 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3740 poffset);
3741 else
3742 poffset = offset;
3745 if (!poffset)
3747 offset_iv.base = ssize_int (0);
3748 offset_iv.step = ssize_int (0);
3750 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3751 &offset_iv, false))
3753 if (dump_enabled_p ())
3754 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3755 "evolution of offset is not affine.\n");
3756 return false;
3759 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3760 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3761 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3762 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3763 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3765 outer_step = size_binop (PLUS_EXPR,
3766 fold_convert (ssizetype, base_iv.step),
3767 fold_convert (ssizetype, offset_iv.step));
3769 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3770 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3771 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3772 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3773 STMT_VINFO_DR_OFFSET (stmt_info) =
3774 fold_convert (ssizetype, offset_iv.base);
3775 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3776 size_int (highest_pow2_factor (offset_iv.base));
3778 if (dump_enabled_p ())
3780 dump_printf_loc (MSG_NOTE, vect_location,
3781 "\touter base_address: ");
3782 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3783 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3784 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3785 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3786 STMT_VINFO_DR_OFFSET (stmt_info));
3787 dump_printf (MSG_NOTE,
3788 "\n\touter constant offset from base address: ");
3789 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3790 STMT_VINFO_DR_INIT (stmt_info));
3791 dump_printf (MSG_NOTE, "\n\touter step: ");
3792 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3793 STMT_VINFO_DR_STEP (stmt_info));
3794 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3795 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3796 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3797 dump_printf (MSG_NOTE, "\n");
3801 if (STMT_VINFO_DATA_REF (stmt_info))
3803 if (dump_enabled_p ())
3805 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3806 "not vectorized: more than one data ref "
3807 "in stmt: ");
3808 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3811 if (is_a <bb_vec_info> (vinfo))
3812 break;
3814 if (gatherscatter != SG_NONE || simd_lane_access)
3815 free_data_ref (dr);
3816 return false;
3819 STMT_VINFO_DATA_REF (stmt_info) = dr;
3820 if (simd_lane_access)
3822 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3823 free_data_ref (datarefs[i]);
3824 datarefs[i] = dr;
3827 /* Set vectype for STMT. */
3828 scalar_type = TREE_TYPE (DR_REF (dr));
3829 STMT_VINFO_VECTYPE (stmt_info)
3830 = get_vectype_for_scalar_type (scalar_type);
3831 if (!STMT_VINFO_VECTYPE (stmt_info))
3833 if (dump_enabled_p ())
3835 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3836 "not vectorized: no vectype for stmt: ");
3837 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3838 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3839 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3840 scalar_type);
3841 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3844 if (is_a <bb_vec_info> (vinfo))
3846 /* No vector type is fine, the ref can still participate
3847 in dependence analysis, we just can't vectorize it. */
3848 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3849 continue;
3852 if (gatherscatter != SG_NONE || simd_lane_access)
3854 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3855 if (gatherscatter != SG_NONE)
3856 free_data_ref (dr);
3858 return false;
3860 else
3862 if (dump_enabled_p ())
3864 dump_printf_loc (MSG_NOTE, vect_location,
3865 "got vectype for stmt: ");
3866 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3867 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3868 STMT_VINFO_VECTYPE (stmt_info));
3869 dump_printf (MSG_NOTE, "\n");
3873 /* Adjust the minimal vectorization factor according to the
3874 vector type. */
3875 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3876 if (vf > *min_vf)
3877 *min_vf = vf;
3879 if (gatherscatter != SG_NONE)
3881 tree off;
3882 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3883 NULL, &off, NULL)
3884 || get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3886 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3887 free_data_ref (dr);
3888 if (dump_enabled_p ())
3890 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3891 (gatherscatter == GATHER) ?
3892 "not vectorized: not suitable for gather "
3893 "load " :
3894 "not vectorized: not suitable for scatter "
3895 "store ");
3896 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3898 return false;
3901 free_data_ref (datarefs[i]);
3902 datarefs[i] = dr;
3903 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3906 else if (is_a <loop_vec_info> (vinfo)
3907 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3909 if (nested_in_vect_loop_p (loop, stmt))
3911 if (dump_enabled_p ())
3913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3914 "not vectorized: not suitable for strided "
3915 "load ");
3916 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3918 return false;
3920 STMT_VINFO_STRIDED_P (stmt_info) = true;
3924 /* If we stopped analysis at the first dataref we could not analyze
 3925      when trying to vectorize a basic-block, mark the rest of the datarefs
3926 as not vectorizable and truncate the vector of datarefs. That
3927 avoids spending useless time in analyzing their dependence. */
3928 if (i != datarefs.length ())
3930 gcc_assert (is_a <bb_vec_info> (vinfo));
3931 for (unsigned j = i; j < datarefs.length (); ++j)
3933 data_reference_p dr = datarefs[j];
3934 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3935 free_data_ref (dr);
3937 datarefs.truncate (i);
3940 return true;
3944 /* Function vect_get_new_vect_var.
 3946    Returns a name for a new variable.  The current naming scheme prefixes
 3947    vectorizer generated variables with "vect", "stmp", "mask" or "vectp"
 3948    (depending on the value of VAR_KIND), appending "_" and NAME when NAME
 3949    is provided.  */
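/* For example (illustrative only): vect_get_new_vect_var (vectype,
   vect_pointer_var, "a") produces a temporary whose name is based on
   "vectp_a", while passing NULL for NAME yields one based on the bare
   prefix "vectp".  */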
3951 tree
3952 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3954 const char *prefix;
3955 tree new_vect_var;
3957 switch (var_kind)
3959 case vect_simple_var:
3960 prefix = "vect";
3961 break;
3962 case vect_scalar_var:
3963 prefix = "stmp";
3964 break;
3965 case vect_mask_var:
3966 prefix = "mask";
3967 break;
3968 case vect_pointer_var:
3969 prefix = "vectp";
3970 break;
3971 default:
3972 gcc_unreachable ();
3975 if (name)
3977 char* tmp = concat (prefix, "_", name, NULL);
3978 new_vect_var = create_tmp_reg (type, tmp);
3979 free (tmp);
3981 else
3982 new_vect_var = create_tmp_reg (type, prefix);
3984 return new_vect_var;
3987 /* Like vect_get_new_vect_var but return an SSA name. */
3989 tree
3990 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3992 const char *prefix;
3993 tree new_vect_var;
3995 switch (var_kind)
3997 case vect_simple_var:
3998 prefix = "vect";
3999 break;
4000 case vect_scalar_var:
4001 prefix = "stmp";
4002 break;
4003 case vect_pointer_var:
4004 prefix = "vectp";
4005 break;
4006 default:
4007 gcc_unreachable ();
4010 if (name)
4012 char* tmp = concat (prefix, "_", name, NULL);
4013 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4014 free (tmp);
4016 else
4017 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4019 return new_vect_var;
 4022 /* Duplicate ptr info and set alignment/misalignment on NAME from DR.  */
4024 static void
4025 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
4026 stmt_vec_info stmt_info)
4028 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4029 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
4030 int misalign = DR_MISALIGNMENT (dr);
4031 if (misalign == -1)
4032 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4033 else
4034 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
4037 /* Function vect_create_addr_base_for_vector_ref.
4039 Create an expression that computes the address of the first memory location
4040 that will be accessed for a data reference.
4042 Input:
4043 STMT: The statement containing the data reference.
4044 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
 4045    OFFSET: Optional.  If supplied, it is added to the initial address.
4046 LOOP: Specify relative to which loop-nest should the address be computed.
4047 For example, when the dataref is in an inner-loop nested in an
4048 outer-loop that is now being vectorized, LOOP can be either the
4049 outer-loop, or the inner-loop. The first memory location accessed
4050 by the following dataref ('in' points to short):
4052 for (i=0; i<N; i++)
4053 for (j=0; j<M; j++)
4054 s += in[i+j]
4056 is as follows:
4057 if LOOP=i_loop: &in (relative to i_loop)
4058 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4059 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
 4060            initial address.  Unlike OFFSET, which is the number of elements to
4061 be added, BYTE_OFFSET is measured in bytes.
4063 Output:
4064 1. Return an SSA_NAME whose value is the address of the memory location of
4065 the first vector of the data reference.
4066 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4067 these statement(s) which define the returned SSA_NAME.
4069 FORNOW: We are only handling array accesses with step 1. */
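/* Illustrative sketch (assuming a loop-based dataref with DR_BASE_ADDRESS
   &a, DR_OFFSET 0, DR_INIT 16 and no OFFSET/BYTE_OFFSET arguments): the
   statements appended to NEW_STMT_LIST amount to

       vectp_a = &a + 16;

   and the SSA name holding that value is returned.  If OFFSET is supplied,
   OFFSET * sizeof (element) is added to the constant 16 as well.  */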
4071 tree
4072 vect_create_addr_base_for_vector_ref (gimple *stmt,
4073 gimple_seq *new_stmt_list,
4074 tree offset,
4075 struct loop *loop,
4076 tree byte_offset)
4078 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4079 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4080 tree data_ref_base;
4081 const char *base_name;
4082 tree addr_base;
4083 tree dest;
4084 gimple_seq seq = NULL;
4085 tree base_offset;
4086 tree init;
4087 tree vect_ptr_type;
4088 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4089 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4091 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
4093 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
4095 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
4097 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4098 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
4099 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
4101 else
4103 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
4104 base_offset = unshare_expr (DR_OFFSET (dr));
4105 init = unshare_expr (DR_INIT (dr));
4108 if (loop_vinfo)
4109 base_name = get_name (data_ref_base);
4110 else
4112 base_offset = ssize_int (0);
4113 init = ssize_int (0);
4114 base_name = get_name (DR_REF (dr));
4117 /* Create base_offset */
4118 base_offset = size_binop (PLUS_EXPR,
4119 fold_convert (sizetype, base_offset),
4120 fold_convert (sizetype, init));
4122 if (offset)
4124 offset = fold_build2 (MULT_EXPR, sizetype,
4125 fold_convert (sizetype, offset), step);
4126 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4127 base_offset, offset);
4129 if (byte_offset)
4131 byte_offset = fold_convert (sizetype, byte_offset);
4132 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4133 base_offset, byte_offset);
4136 /* base + base_offset */
4137 if (loop_vinfo)
4138 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4139 else
4141 addr_base = build1 (ADDR_EXPR,
4142 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4143 unshare_expr (DR_REF (dr)));
4146 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4147 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4148 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4149 gimple_seq_add_seq (new_stmt_list, seq);
4151 if (DR_PTR_INFO (dr)
4152 && TREE_CODE (addr_base) == SSA_NAME
4153 && !SSA_NAME_PTR_INFO (addr_base))
4155 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4156 if (offset || byte_offset)
4157 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4160 if (dump_enabled_p ())
4162 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4163 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4164 dump_printf (MSG_NOTE, "\n");
4167 return addr_base;
4171 /* Function vect_create_data_ref_ptr.
4173 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4174 location accessed in the loop by STMT, along with the def-use update
4175 chain to appropriately advance the pointer through the loop iterations.
4176 Also set aliasing information for the pointer. This pointer is used by
4177 the callers to this function to create a memory reference expression for
4178 vector load/store access.
4180 Input:
4181 1. STMT: a stmt that references memory. Expected to be of the form
4182 GIMPLE_ASSIGN <name, data-ref> or
4183 GIMPLE_ASSIGN <data-ref, name>.
4184 2. AGGR_TYPE: the type of the reference, which should be either a vector
4185 or an array.
4186 3. AT_LOOP: the loop where the vector memref is to be created.
4187 4. OFFSET (optional): an offset to be added to the initial address accessed
4188 by the data-ref in STMT.
4189 5. BSI: location where the new stmts are to be placed if there is no loop
4190 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4191 pointing to the initial address.
4192 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4193 to the initial address accessed by the data-ref in STMT. This is
4194 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4195 in bytes.
4197 Output:
4198 1. Declare a new ptr to vector_type, and have it point to the base of the
 4199       data reference (initial address accessed by the data reference).
4200 For example, for vector of type V8HI, the following code is generated:
4202 v8hi *ap;
4203 ap = (v8hi *)initial_address;
4205 if OFFSET is not supplied:
4206 initial_address = &a[init];
4207 if OFFSET is supplied:
4208 initial_address = &a[init + OFFSET];
4209 if BYTE_OFFSET is supplied:
4210 initial_address = &a[init] + BYTE_OFFSET;
4212 Return the initial_address in INITIAL_ADDRESS.
4214 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4215 update the pointer in each iteration of the loop.
4217 Return the increment stmt that updates the pointer in PTR_INCR.
4219 3. Set INV_P to true if the access pattern of the data reference in the
4220 vectorized loop is invariant. Set it to false otherwise.
4222 4. Return the pointer. */
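/* Illustrative usage sketch (hypothetical caller; variable names invented):

       tree dataref_ptr, init_addr;
       gimple *ptr_incr;
       bool inv_p;

       dataref_ptr = vect_create_data_ref_ptr (stmt, vectype, loop,
					       NULL_TREE, &init_addr, gsi,
					       &ptr_incr, false, &inv_p,
					       NULL_TREE);

   The caller then typically builds a MEM_REF based on DATAREF_PTR for the
   vector load/store and, for each copy of the vectorized stmt, advances
   the pointer with bump_vector_ptr.  */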
4224 tree
4225 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4226 tree offset, tree *initial_address,
4227 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4228 bool only_init, bool *inv_p, tree byte_offset)
4230 const char *base_name;
4231 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4232 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4233 struct loop *loop = NULL;
4234 bool nested_in_vect_loop = false;
4235 struct loop *containing_loop = NULL;
4236 tree aggr_ptr_type;
4237 tree aggr_ptr;
4238 tree new_temp;
4239 gimple_seq new_stmt_list = NULL;
4240 edge pe = NULL;
4241 basic_block new_bb;
4242 tree aggr_ptr_init;
4243 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4244 tree aptr;
4245 gimple_stmt_iterator incr_gsi;
4246 bool insert_after;
4247 tree indx_before_incr, indx_after_incr;
4248 gimple *incr;
4249 tree step;
4250 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4252 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4253 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4255 if (loop_vinfo)
4257 loop = LOOP_VINFO_LOOP (loop_vinfo);
4258 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4259 containing_loop = (gimple_bb (stmt))->loop_father;
4260 pe = loop_preheader_edge (loop);
4262 else
4264 gcc_assert (bb_vinfo);
4265 only_init = true;
4266 *ptr_incr = NULL;
4269 /* Check the step (evolution) of the load in LOOP, and record
4270 whether it's invariant. */
4271 if (nested_in_vect_loop)
4272 step = STMT_VINFO_DR_STEP (stmt_info);
4273 else
4274 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4276 if (integer_zerop (step))
4277 *inv_p = true;
4278 else
4279 *inv_p = false;
4281 /* Create an expression for the first address accessed by this load
4282 in LOOP. */
4283 base_name = get_name (DR_BASE_ADDRESS (dr));
4285 if (dump_enabled_p ())
4287 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4288 dump_printf_loc (MSG_NOTE, vect_location,
4289 "create %s-pointer variable to type: ",
4290 get_tree_code_name (TREE_CODE (aggr_type)));
4291 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4292 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4293 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4294 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4295 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4296 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4297 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4298 else
4299 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4300 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4301 dump_printf (MSG_NOTE, "\n");
4304 /* (1) Create the new aggregate-pointer variable.
4305 Vector and array types inherit the alias set of their component
4306 type by default so we need to use a ref-all pointer if the data
4307 reference does not conflict with the created aggregated data
4308 reference because it is not addressable. */
4309 bool need_ref_all = false;
4310 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4311 get_alias_set (DR_REF (dr))))
4312 need_ref_all = true;
4313 /* Likewise for any of the data references in the stmt group. */
4314 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4316 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4319 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4320 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4321 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4322 get_alias_set (DR_REF (sdr))))
4324 need_ref_all = true;
4325 break;
4327 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4329 while (orig_stmt);
4331 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4332 need_ref_all);
4333 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4336 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4337 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4338 def-use update cycles for the pointer: one relative to the outer-loop
4339 (LOOP), which is what steps (3) and (4) below do. The other is relative
4340 to the inner-loop (which is the inner-most loop containing the dataref),
 4341      and this is done by step (5) below.
4343 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4344 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4345 redundant. Steps (3),(4) create the following:
4347 vp0 = &base_addr;
4348 LOOP: vp1 = phi(vp0,vp2)
4351 vp2 = vp1 + step
4352 goto LOOP
 4354      If there is an inner-loop nested in LOOP, then step (5) will also be
4355 applied, and an additional update in the inner-loop will be created:
4357 vp0 = &base_addr;
4358 LOOP: vp1 = phi(vp0,vp2)
4360 inner: vp3 = phi(vp1,vp4)
4361 vp4 = vp3 + inner_step
4362 if () goto inner
4364 vp2 = vp1 + step
4365 if () goto LOOP */
4367 /* (2) Calculate the initial address of the aggregate-pointer, and set
4368 the aggregate-pointer to point to it before the loop. */
4370 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4372 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4373 offset, loop, byte_offset);
4374 if (new_stmt_list)
4376 if (pe)
4378 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4379 gcc_assert (!new_bb);
4381 else
4382 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4385 *initial_address = new_temp;
4386 aggr_ptr_init = new_temp;
4388 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4389 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4390 inner-loop nested in LOOP (during outer-loop vectorization). */
4392 /* No update in loop is required. */
4393 if (only_init && (!loop_vinfo || at_loop == loop))
4394 aptr = aggr_ptr_init;
4395 else
4397 /* The step of the aggregate pointer is the type size. */
4398 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4399 /* One exception to the above is when the scalar step of the load in
4400 LOOP is zero. In this case the step here is also zero. */
4401 if (*inv_p)
4402 iv_step = size_zero_node;
4403 else if (tree_int_cst_sgn (step) == -1)
4404 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4406 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4408 create_iv (aggr_ptr_init,
4409 fold_convert (aggr_ptr_type, iv_step),
4410 aggr_ptr, loop, &incr_gsi, insert_after,
4411 &indx_before_incr, &indx_after_incr);
4412 incr = gsi_stmt (incr_gsi);
4413 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4415 /* Copy the points-to information if it exists. */
4416 if (DR_PTR_INFO (dr))
4418 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4419 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4421 if (ptr_incr)
4422 *ptr_incr = incr;
4424 aptr = indx_before_incr;
4427 if (!nested_in_vect_loop || only_init)
4428 return aptr;
4431 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4432 nested in LOOP, if exists. */
4434 gcc_assert (nested_in_vect_loop);
4435 if (!only_init)
4437 standard_iv_increment_position (containing_loop, &incr_gsi,
4438 &insert_after);
4439 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4440 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4441 &indx_after_incr);
4442 incr = gsi_stmt (incr_gsi);
4443 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4445 /* Copy the points-to information if it exists. */
4446 if (DR_PTR_INFO (dr))
4448 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4449 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4451 if (ptr_incr)
4452 *ptr_incr = incr;
4454 return indx_before_incr;
4456 else
4457 gcc_unreachable ();
4461 /* Function bump_vector_ptr
4463 Increment a pointer (to a vector type) by vector-size. If requested,
4464 i.e. if PTR-INCR is given, then also connect the new increment stmt
4465 to the existing def-use update-chain of the pointer, by modifying
4466 the PTR_INCR as illustrated below:
4468 The pointer def-use update-chain before this function:
4469 DATAREF_PTR = phi (p_0, p_2)
4470 ....
4471 PTR_INCR: p_2 = DATAREF_PTR + step
4473 The pointer def-use update-chain after this function:
4474 DATAREF_PTR = phi (p_0, p_2)
4475 ....
4476 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4477 ....
4478 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4480 Input:
4481 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4482 in the loop.
4483 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4484 the loop. The increment amount across iterations is expected
4485 to be vector_size.
4486 BSI - location where the new update stmt is to be placed.
4487 STMT - the original scalar memory-access stmt that is being vectorized.
4488 BUMP - optional. The offset by which to bump the pointer. If not given,
4489 the offset is assumed to be vector_size.
4491 Output: Return NEW_DATAREF_PTR as illustrated above.
4495 tree
4496 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4497 gimple *stmt, tree bump)
4499 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4500 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4501 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4502 tree update = TYPE_SIZE_UNIT (vectype);
4503 gassign *incr_stmt;
4504 ssa_op_iter iter;
4505 use_operand_p use_p;
4506 tree new_dataref_ptr;
4508 if (bump)
4509 update = bump;
4511 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4512 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4513 else
4514 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4515 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4516 dataref_ptr, update);
4517 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4519 /* Copy the points-to information if it exists. */
4520 if (DR_PTR_INFO (dr))
4522 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4523 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4526 if (!ptr_incr)
4527 return new_dataref_ptr;
4529 /* Update the vector-pointer's cross-iteration increment. */
4530 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4532 tree use = USE_FROM_PTR (use_p);
4534 if (use == dataref_ptr)
4535 SET_USE (use_p, new_dataref_ptr);
4536 else
4537 gcc_assert (tree_int_cst_compare (use, update) == 0);
4540 return new_dataref_ptr;
4544 /* Function vect_create_destination_var.
4546 Create a new temporary of type VECTYPE. */
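/* For example (illustrative only): if SCALAR_DEST is the SSA name x_7 and
   VECTYPE is a non-boolean vector type, the new temporary's name is based
   on "vect_x_7"; with a NULL VECTYPE a scalar temporary based on
   "stmp_x_7" is created instead.  */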
4548 tree
4549 vect_create_destination_var (tree scalar_dest, tree vectype)
4551 tree vec_dest;
4552 const char *name;
4553 char *new_name;
4554 tree type;
4555 enum vect_var_kind kind;
4557 kind = vectype
4558 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4559 ? vect_mask_var
4560 : vect_simple_var
4561 : vect_scalar_var;
4562 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4564 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4566 name = get_name (scalar_dest);
4567 if (name)
4568 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4569 else
4570 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4571 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4572 free (new_name);
4574 return vec_dest;
4577 /* Function vect_grouped_store_supported.
4579 Returns TRUE if interleave high and interleave low permutations
4580 are supported, and FALSE otherwise. */
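/* As a concrete illustration (assuming an 8-element vector mode, i.e.
   nelt == 8, and a power-of-two COUNT): the two permutations checked below
   are the interleave-high selector { 0, 8, 1, 9, 2, 10, 3, 11 } and the
   interleave-low selector { 4, 12, 5, 13, 6, 14, 7, 15 }, which together
   merge two input vectors element by element.  */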
4582 bool
4583 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4585 machine_mode mode = TYPE_MODE (vectype);
4587 /* vect_permute_store_chain requires the group size to be equal to 3 or
4588 be a power of two. */
4589 if (count != 3 && exact_log2 (count) == -1)
4591 if (dump_enabled_p ())
4592 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4593 "the size of the group of accesses"
4594 " is not a power of 2 or not eqaul to 3\n");
4595 return false;
4598 /* Check that the permutation is supported. */
4599 if (VECTOR_MODE_P (mode))
4601 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4602 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4604 if (count == 3)
4606 unsigned int j0 = 0, j1 = 0, j2 = 0;
4607 unsigned int i, j;
4609 for (j = 0; j < 3; j++)
4611 int nelt0 = ((3 - j) * nelt) % 3;
4612 int nelt1 = ((3 - j) * nelt + 1) % 3;
4613 int nelt2 = ((3 - j) * nelt + 2) % 3;
4614 for (i = 0; i < nelt; i++)
4616 if (3 * i + nelt0 < nelt)
4617 sel[3 * i + nelt0] = j0++;
4618 if (3 * i + nelt1 < nelt)
4619 sel[3 * i + nelt1] = nelt + j1++;
4620 if (3 * i + nelt2 < nelt)
4621 sel[3 * i + nelt2] = 0;
4623 if (!can_vec_perm_p (mode, false, sel))
4625 if (dump_enabled_p ())
4626 dump_printf (MSG_MISSED_OPTIMIZATION,
4627 "permutaion op not supported by target.\n");
4628 return false;
4631 for (i = 0; i < nelt; i++)
4633 if (3 * i + nelt0 < nelt)
4634 sel[3 * i + nelt0] = 3 * i + nelt0;
4635 if (3 * i + nelt1 < nelt)
4636 sel[3 * i + nelt1] = 3 * i + nelt1;
4637 if (3 * i + nelt2 < nelt)
4638 sel[3 * i + nelt2] = nelt + j2++;
4640 if (!can_vec_perm_p (mode, false, sel))
4642 if (dump_enabled_p ())
4643 dump_printf (MSG_MISSED_OPTIMIZATION,
4644 "permutaion op not supported by target.\n");
4645 return false;
4648 return true;
4650 else
 4652       /* If length is not equal to 3 then only a power of 2 is supported.  */
4653 gcc_assert (exact_log2 (count) != -1);
4655 for (i = 0; i < nelt / 2; i++)
4657 sel[i * 2] = i;
4658 sel[i * 2 + 1] = i + nelt;
4660 if (can_vec_perm_p (mode, false, sel))
4662 for (i = 0; i < nelt; i++)
4663 sel[i] += nelt / 2;
4664 if (can_vec_perm_p (mode, false, sel))
4665 return true;
4670 if (dump_enabled_p ())
4671 dump_printf (MSG_MISSED_OPTIMIZATION,
4672 "permutaion op not supported by target.\n");
4673 return false;
4677 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4678 type VECTYPE. */
4680 bool
4681 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4683 return vect_lanes_optab_supported_p ("vec_store_lanes",
4684 vec_store_lanes_optab,
4685 vectype, count);
4689 /* Function vect_permute_store_chain.
4691 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4692 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4693 the data correctly for the stores. Return the final references for stores
4694 in RESULT_CHAIN.
4696 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4697 The input is 4 vectors each containing 8 elements. We assign a number to
4698 each element, the input sequence is:
4700 1st vec: 0 1 2 3 4 5 6 7
4701 2nd vec: 8 9 10 11 12 13 14 15
4702 3rd vec: 16 17 18 19 20 21 22 23
4703 4th vec: 24 25 26 27 28 29 30 31
4705 The output sequence should be:
4707 1st vec: 0 8 16 24 1 9 17 25
4708 2nd vec: 2 10 18 26 3 11 19 27
 4709    3rd vec:  4 12 20 28  5 13 21 29
4710 4th vec: 6 14 22 30 7 15 23 31
4712 i.e., we interleave the contents of the four vectors in their order.
4714 We use interleave_high/low instructions to create such output. The input of
4715 each interleave_high/low operation is two vectors:
4716 1st vec 2nd vec
4717 0 1 2 3 4 5 6 7
4718 the even elements of the result vector are obtained left-to-right from the
4719 high/low elements of the first vector. The odd elements of the result are
4720 obtained left-to-right from the high/low elements of the second vector.
4721 The output of interleave_high will be: 0 4 1 5
4722 and of interleave_low: 2 6 3 7
4725 The permutation is done in log LENGTH stages. In each stage interleave_high
4726 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4727 where the first argument is taken from the first half of DR_CHAIN and the
 4728    second argument from its second half.
4729 In our example,
4731 I1: interleave_high (1st vec, 3rd vec)
4732 I2: interleave_low (1st vec, 3rd vec)
4733 I3: interleave_high (2nd vec, 4th vec)
4734 I4: interleave_low (2nd vec, 4th vec)
4736 The output for the first stage is:
4738 I1: 0 16 1 17 2 18 3 19
4739 I2: 4 20 5 21 6 22 7 23
4740 I3: 8 24 9 25 10 26 11 27
4741 I4: 12 28 13 29 14 30 15 31
4743 The output of the second stage, i.e. the final result is:
4745 I1: 0 8 16 24 1 9 17 25
4746 I2: 2 10 18 26 3 11 19 27
 4747    I3:  4 12 20 28  5 13 21 29
4748 I4: 6 14 22 30 7 15 23 31. */
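/* Illustrative sketch of the power-of-two path below for the LENGTH == 4,
   nelt == 8 example above (selector values only):

     stage 1:  I1 = VEC_PERM_EXPR <v0, v2, { 0, 8, 1, 9, 2, 10, 3, 11 }>
	       I2 = VEC_PERM_EXPR <v0, v2, { 4, 12, 5, 13, 6, 14, 7, 15 }>
	       I3 = VEC_PERM_EXPR <v1, v3, { 0, 8, 1, 9, 2, 10, 3, 11 }>
	       I4 = VEC_PERM_EXPR <v1, v3, { 4, 12, 5, 13, 6, 14, 7, 15 }>

   Stage 2 applies the same two selectors to the pairs (I1, I3) and
   (I2, I4), giving the final interleaved vectors shown above.  */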
4750 void
4751 vect_permute_store_chain (vec<tree> dr_chain,
4752 unsigned int length,
4753 gimple *stmt,
4754 gimple_stmt_iterator *gsi,
4755 vec<tree> *result_chain)
4757 tree vect1, vect2, high, low;
4758 gimple *perm_stmt;
4759 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4760 tree perm_mask_low, perm_mask_high;
4761 tree data_ref;
4762 tree perm3_mask_low, perm3_mask_high;
4763 unsigned int i, n, log_length = exact_log2 (length);
4764 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4765 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4767 result_chain->quick_grow (length);
4768 memcpy (result_chain->address (), dr_chain.address (),
4769 length * sizeof (tree));
4771 if (length == 3)
4773 unsigned int j0 = 0, j1 = 0, j2 = 0;
4775 for (j = 0; j < 3; j++)
4777 int nelt0 = ((3 - j) * nelt) % 3;
4778 int nelt1 = ((3 - j) * nelt + 1) % 3;
4779 int nelt2 = ((3 - j) * nelt + 2) % 3;
4781 for (i = 0; i < nelt; i++)
4783 if (3 * i + nelt0 < nelt)
4784 sel[3 * i + nelt0] = j0++;
4785 if (3 * i + nelt1 < nelt)
4786 sel[3 * i + nelt1] = nelt + j1++;
4787 if (3 * i + nelt2 < nelt)
4788 sel[3 * i + nelt2] = 0;
4790 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4792 for (i = 0; i < nelt; i++)
4794 if (3 * i + nelt0 < nelt)
4795 sel[3 * i + nelt0] = 3 * i + nelt0;
4796 if (3 * i + nelt1 < nelt)
4797 sel[3 * i + nelt1] = 3 * i + nelt1;
4798 if (3 * i + nelt2 < nelt)
4799 sel[3 * i + nelt2] = nelt + j2++;
4801 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4803 vect1 = dr_chain[0];
4804 vect2 = dr_chain[1];
4806 /* Create interleaving stmt:
4807 low = VEC_PERM_EXPR <vect1, vect2,
4808 {j, nelt, *, j + 1, nelt + j + 1, *,
4809 j + 2, nelt + j + 2, *, ...}> */
4810 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4811 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4812 vect2, perm3_mask_low);
4813 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4815 vect1 = data_ref;
4816 vect2 = dr_chain[2];
4817 /* Create interleaving stmt:
4818 low = VEC_PERM_EXPR <vect1, vect2,
4819 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4820 6, 7, nelt + j + 2, ...}> */
4821 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4822 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4823 vect2, perm3_mask_high);
4824 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4825 (*result_chain)[j] = data_ref;
4828 else
 4830       /* If length is not equal to 3 then only a power of 2 is supported.  */
4831 gcc_assert (exact_log2 (length) != -1);
4833 for (i = 0, n = nelt / 2; i < n; i++)
4835 sel[i * 2] = i;
4836 sel[i * 2 + 1] = i + nelt;
4838 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4840 for (i = 0; i < nelt; i++)
4841 sel[i] += nelt / 2;
4842 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4844 for (i = 0, n = log_length; i < n; i++)
4846 for (j = 0; j < length/2; j++)
4848 vect1 = dr_chain[j];
4849 vect2 = dr_chain[j+length/2];
4851 /* Create interleaving stmt:
4852 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4853 ...}> */
4854 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4855 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4856 vect2, perm_mask_high);
4857 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4858 (*result_chain)[2*j] = high;
4860 /* Create interleaving stmt:
4861 low = VEC_PERM_EXPR <vect1, vect2,
4862 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4863 ...}> */
4864 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4865 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4866 vect2, perm_mask_low);
4867 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4868 (*result_chain)[2*j+1] = low;
4870 memcpy (dr_chain.address (), result_chain->address (),
4871 length * sizeof (tree));
4876 /* Function vect_setup_realignment
4878 This function is called when vectorizing an unaligned load using
4879 the dr_explicit_realign[_optimized] scheme.
4880 This function generates the following code at the loop prolog:
4882 p = initial_addr;
4883 x msq_init = *(floor(p)); # prolog load
4884 realignment_token = call target_builtin;
4885 loop:
4886 x msq = phi (msq_init, ---)
4888 The stmts marked with x are generated only for the case of
4889 dr_explicit_realign_optimized.
4891 The code above sets up a new (vector) pointer, pointing to the first
4892 location accessed by STMT, and a "floor-aligned" load using that pointer.
4893 It also generates code to compute the "realignment-token" (if the relevant
4894 target hook was defined), and creates a phi-node at the loop-header bb
4895 whose arguments are the result of the prolog-load (created by this
4896 function) and the result of a load that takes place in the loop (to be
4897 created by the caller to this function).
4899 For the case of dr_explicit_realign_optimized:
4900 The caller to this function uses the phi-result (msq) to create the
4901 realignment code inside the loop, and sets up the missing phi argument,
4902 as follows:
4903 loop:
4904 msq = phi (msq_init, lsq)
4905 lsq = *(floor(p')); # load in loop
4906 result = realign_load (msq, lsq, realignment_token);
4908 For the case of dr_explicit_realign:
4909 loop:
4910 msq = *(floor(p)); # load in loop
4911 p' = p + (VS-1);
4912 lsq = *(floor(p')); # load in loop
4913 result = realign_load (msq, lsq, realignment_token);
4915 Input:
4916 STMT - (scalar) load stmt to be vectorized. This load accesses
4917 a memory location that may be unaligned.
4918 BSI - place where new code is to be inserted.
4919 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4920 is used.
4922 Output:
4923 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4924 target hook, if defined.
4925 Return value - the result of the loop-header phi node. */
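/* As a concrete illustration (assuming 16-byte vectors and a pointer P
   misaligned by 4 bytes): the "floor" operation below is implemented as
   P & -16, so the prolog load fetches the aligned vector containing the
   first accessed element, and REALIGN_LOAD later combines two such aligned
   loads, guided by the realignment token, to reconstruct the 16 bytes
   starting at P.  */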
4927 tree
4928 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4929 tree *realignment_token,
4930 enum dr_alignment_support alignment_support_scheme,
4931 tree init_addr,
4932 struct loop **at_loop)
4934 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4935 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4936 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4937 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4938 struct loop *loop = NULL;
4939 edge pe = NULL;
4940 tree scalar_dest = gimple_assign_lhs (stmt);
4941 tree vec_dest;
4942 gimple *inc;
4943 tree ptr;
4944 tree data_ref;
4945 basic_block new_bb;
4946 tree msq_init = NULL_TREE;
4947 tree new_temp;
4948 gphi *phi_stmt;
4949 tree msq = NULL_TREE;
4950 gimple_seq stmts = NULL;
4951 bool inv_p;
4952 bool compute_in_loop = false;
4953 bool nested_in_vect_loop = false;
4954 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4955 struct loop *loop_for_initial_load = NULL;
4957 if (loop_vinfo)
4959 loop = LOOP_VINFO_LOOP (loop_vinfo);
4960 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4963 gcc_assert (alignment_support_scheme == dr_explicit_realign
4964 || alignment_support_scheme == dr_explicit_realign_optimized);
4966 /* We need to generate three things:
4967 1. the misalignment computation
4968 2. the extra vector load (for the optimized realignment scheme).
4969 3. the phi node for the two vectors from which the realignment is
4970 done (for the optimized realignment scheme). */
4972 /* 1. Determine where to generate the misalignment computation.
4974 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4975 calculation will be generated by this function, outside the loop (in the
4976 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4977 caller, inside the loop.
4979 Background: If the misalignment remains fixed throughout the iterations of
4980 the loop, then both realignment schemes are applicable, and also the
4981 misalignment computation can be done outside LOOP. This is because we are
4982 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4983 are a multiple of VS (the Vector Size), and therefore the misalignment in
4984 different vectorized LOOP iterations is always the same.
4985 The problem arises only if the memory access is in an inner-loop nested
4986 inside LOOP, which is now being vectorized using outer-loop vectorization.
4987 This is the only case when the misalignment of the memory access may not
4988 remain fixed throughout the iterations of the inner-loop (as explained in
4989 detail in vect_supportable_dr_alignment). In this case, not only is the
4990 optimized realignment scheme not applicable, but also the misalignment
4991 computation (and generation of the realignment token that is passed to
4992 REALIGN_LOAD) have to be done inside the loop.
4994 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4995 or not, which in turn determines if the misalignment is computed inside
4996 the inner-loop, or outside LOOP. */
4998 if (init_addr != NULL_TREE || !loop_vinfo)
5000 compute_in_loop = true;
5001 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5005 /* 2. Determine where to generate the extra vector load.
5007 For the optimized realignment scheme, instead of generating two vector
5008 loads in each iteration, we generate a single extra vector load in the
5009 preheader of the loop, and in each iteration reuse the result of the
5010 vector load from the previous iteration. In case the memory access is in
5011 an inner-loop nested inside LOOP, which is now being vectorized using
5012 outer-loop vectorization, we need to determine whether this initial vector
5013 load should be generated at the preheader of the inner-loop, or can be
5014 generated at the preheader of LOOP. If the memory access has no evolution
5015 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5016 to be generated inside LOOP (in the preheader of the inner-loop). */
5018 if (nested_in_vect_loop)
5020 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5021 bool invariant_in_outerloop =
5022 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5023 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5025 else
5026 loop_for_initial_load = loop;
5027 if (at_loop)
5028 *at_loop = loop_for_initial_load;
5030 if (loop_for_initial_load)
5031 pe = loop_preheader_edge (loop_for_initial_load);
5033 /* 3. For the case of the optimized realignment, create the first vector
5034 load at the loop preheader. */
5036 if (alignment_support_scheme == dr_explicit_realign_optimized)
5038 /* Create msq_init = *(floor(p1)) in the loop preheader */
5039 gassign *new_stmt;
5041 gcc_assert (!compute_in_loop);
5042 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5043 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5044 NULL_TREE, &init_addr, NULL, &inc,
5045 true, &inv_p);
5046 if (TREE_CODE (ptr) == SSA_NAME)
5047 new_temp = copy_ssa_name (ptr);
5048 else
5049 new_temp = make_ssa_name (TREE_TYPE (ptr));
5050 new_stmt = gimple_build_assign
5051 (new_temp, BIT_AND_EXPR, ptr,
5052 build_int_cst (TREE_TYPE (ptr),
5053 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
5054 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5055 gcc_assert (!new_bb);
5056 data_ref
5057 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5058 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5059 new_stmt = gimple_build_assign (vec_dest, data_ref);
5060 new_temp = make_ssa_name (vec_dest, new_stmt);
5061 gimple_assign_set_lhs (new_stmt, new_temp);
5062 if (pe)
5064 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5065 gcc_assert (!new_bb);
5067 else
5068 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5070 msq_init = gimple_assign_lhs (new_stmt);
5073 /* 4. Create realignment token using a target builtin, if available.
5074 It is done either inside the containing loop, or before LOOP (as
5075 determined above). */
5077 if (targetm.vectorize.builtin_mask_for_load)
5079 gcall *new_stmt;
5080 tree builtin_decl;
 5082       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
5083 if (!init_addr)
5085 /* Generate the INIT_ADDR computation outside LOOP. */
5086 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5087 NULL_TREE, loop);
5088 if (loop)
5090 pe = loop_preheader_edge (loop);
5091 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5092 gcc_assert (!new_bb);
5094 else
5095 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5098 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5099 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5100 vec_dest =
5101 vect_create_destination_var (scalar_dest,
5102 gimple_call_return_type (new_stmt));
5103 new_temp = make_ssa_name (vec_dest, new_stmt);
5104 gimple_call_set_lhs (new_stmt, new_temp);
5106 if (compute_in_loop)
5107 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5108 else
5110 /* Generate the misalignment computation outside LOOP. */
5111 pe = loop_preheader_edge (loop);
5112 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5113 gcc_assert (!new_bb);
5116 *realignment_token = gimple_call_lhs (new_stmt);
5118 /* The result of the CALL_EXPR to this builtin is determined from
5119 the value of the parameter and no global variables are touched
5120 which makes the builtin a "const" function. Requiring the
5121 builtin to have the "const" attribute makes it unnecessary
5122 to call mark_call_clobbered. */
5123 gcc_assert (TREE_READONLY (builtin_decl));
5126 if (alignment_support_scheme == dr_explicit_realign)
5127 return msq;
5129 gcc_assert (!compute_in_loop);
5130 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5133 /* 5. Create msq = phi <msq_init, lsq> in loop */
5135 pe = loop_preheader_edge (containing_loop);
5136 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5137 msq = make_ssa_name (vec_dest);
5138 phi_stmt = create_phi_node (msq, containing_loop->header);
5139 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5141 return msq;
5145 /* Function vect_grouped_load_supported.
5147 Returns TRUE if even and odd permutations are supported,
5148 and FALSE otherwise. */
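/* As a concrete illustration (assuming an 8-element vector mode, i.e.
   nelt == 8, and a power-of-two COUNT): the permutations checked below are
   the extract-even selector { 0, 2, 4, 6, 8, 10, 12, 14 } and the
   extract-odd selector { 1, 3, 5, 7, 9, 11, 13, 15 }, each picking
   alternate elements from the concatenation of two input vectors.  */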
5150 bool
5151 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
5153 machine_mode mode = TYPE_MODE (vectype);
5155 /* vect_permute_load_chain requires the group size to be equal to 3 or
5156 be a power of two. */
5157 if (count != 3 && exact_log2 (count) == -1)
5159 if (dump_enabled_p ())
5160 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5161 "the size of the group of accesses"
5162 " is not a power of 2 or not equal to 3\n");
5163 return false;
5166 /* Check that the permutation is supported. */
5167 if (VECTOR_MODE_P (mode))
5169 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5170 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5172 if (count == 3)
5174 unsigned int k;
5175 for (k = 0; k < 3; k++)
5177 for (i = 0; i < nelt; i++)
5178 if (3 * i + k < 2 * nelt)
5179 sel[i] = 3 * i + k;
5180 else
5181 sel[i] = 0;
5182 if (!can_vec_perm_p (mode, false, sel))
5184 if (dump_enabled_p ())
5185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5186 "shuffle of 3 loads is not supported by"
5187 " target\n");
5188 return false;
5190 for (i = 0, j = 0; i < nelt; i++)
5191 if (3 * i + k < 2 * nelt)
5192 sel[i] = i;
5193 else
5194 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5195 if (!can_vec_perm_p (mode, false, sel))
5197 if (dump_enabled_p ())
5198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5199 "shuffle of 3 loads is not supported by"
5200 " target\n");
5201 return false;
5204 return true;
5206 else
 5208       /* If length is not equal to 3 then only a power of 2 is supported.  */
5209 gcc_assert (exact_log2 (count) != -1);
5210 for (i = 0; i < nelt; i++)
5211 sel[i] = i * 2;
5212 if (can_vec_perm_p (mode, false, sel))
5214 for (i = 0; i < nelt; i++)
5215 sel[i] = i * 2 + 1;
5216 if (can_vec_perm_p (mode, false, sel))
5217 return true;
5222 if (dump_enabled_p ())
5223 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5224 "extract even/odd not supported by target\n");
5225 return false;
5228 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5229 type VECTYPE. */
5231 bool
5232 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5234 return vect_lanes_optab_supported_p ("vec_load_lanes",
5235 vec_load_lanes_optab,
5236 vectype, count);
5239 /* Function vect_permute_load_chain.
5241 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5242 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5243 the input data correctly. Return the final references for loads in
5244 RESULT_CHAIN.
5246 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5247 The input is 4 vectors each containing 8 elements. We assign a number to each
5248 element, the input sequence is:
5250 1st vec: 0 1 2 3 4 5 6 7
5251 2nd vec: 8 9 10 11 12 13 14 15
5252 3rd vec: 16 17 18 19 20 21 22 23
5253 4th vec: 24 25 26 27 28 29 30 31
5255 The output sequence should be:
5257 1st vec: 0 4 8 12 16 20 24 28
5258 2nd vec: 1 5 9 13 17 21 25 29
5259 3rd vec: 2 6 10 14 18 22 26 30
5260 4th vec: 3 7 11 15 19 23 27 31
5262 i.e., the first output vector should contain the first elements of each
5263 interleaving group, etc.
5265 We use extract_even/odd instructions to create such output. The input of
5266 each extract_even/odd operation is two vectors
5267 1st vec 2nd vec
5268 0 1 2 3 4 5 6 7
5270 and the output is the vector of extracted even/odd elements. The output of
5271 extract_even will be: 0 2 4 6
5272 and of extract_odd: 1 3 5 7
5275 The permutation is done in log LENGTH stages. In each stage extract_even
5276 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5277 their order. In our example,
5279 E1: extract_even (1st vec, 2nd vec)
5280 E2: extract_odd (1st vec, 2nd vec)
5281 E3: extract_even (3rd vec, 4th vec)
5282 E4: extract_odd (3rd vec, 4th vec)
5284 The output for the first stage will be:
5286 E1: 0 2 4 6 8 10 12 14
5287 E2: 1 3 5 7 9 11 13 15
5288 E3: 16 18 20 22 24 26 28 30
5289 E4: 17 19 21 23 25 27 29 31
5291 In order to proceed and create the correct sequence for the next stage (or
5292 for the correct output, if the second stage is the last one, as in our
5293 example), we first put the output of extract_even operation and then the
5294 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5295 The input for the second stage is:
5297 1st vec (E1): 0 2 4 6 8 10 12 14
5298 2nd vec (E3): 16 18 20 22 24 26 28 30
5299 3rd vec (E2): 1 3 5 7 9 11 13 15
5300 4th vec (E4): 17 19 21 23 25 27 29 31
5302 The output of the second stage:
5304 E1: 0 4 8 12 16 20 24 28
5305 E2: 2 6 10 14 18 22 26 30
5306 E3: 1 5 9 13 17 21 25 29
5307 E4: 3 7 11 15 19 23 27 31
5309 And RESULT_CHAIN after reordering:
5311 1st vec (E1): 0 4 8 12 16 20 24 28
5312 2nd vec (E3): 1 5 9 13 17 21 25 29
5313 3rd vec (E2): 2 6 10 14 18 22 26 30
5314 4th vec (E4): 3 7 11 15 19 23 27 31. */
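/* Illustrative sketch of the LENGTH == 3 path below, assuming nelt == 8,
   k == 0 and three input vectors v0 = { 0..7 }, v1 = { 8..15 },
   v2 = { 16..23 }: the first VEC_PERM_EXPR of (v0, v1) uses the selector
   { 0, 3, 6, 9, 12, 15, 0, 0 }, producing { 0 3 6 9 12 15 _ _ }; the
   second VEC_PERM_EXPR of that result with v2 uses
   { 0, 1, 2, 3, 4, 5, 10, 13 }, filling the last two slots from v2 and
   yielding { 0 3 6 9 12 15 18 21 }, i.e. every third element.  */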
5316 static void
5317 vect_permute_load_chain (vec<tree> dr_chain,
5318 unsigned int length,
5319 gimple *stmt,
5320 gimple_stmt_iterator *gsi,
5321 vec<tree> *result_chain)
5323 tree data_ref, first_vect, second_vect;
5324 tree perm_mask_even, perm_mask_odd;
5325 tree perm3_mask_low, perm3_mask_high;
5326 gimple *perm_stmt;
5327 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5328 unsigned int i, j, log_length = exact_log2 (length);
5329 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5330 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5332 result_chain->quick_grow (length);
5333 memcpy (result_chain->address (), dr_chain.address (),
5334 length * sizeof (tree));
5336 if (length == 3)
5338 unsigned int k;
5340 for (k = 0; k < 3; k++)
5342 for (i = 0; i < nelt; i++)
5343 if (3 * i + k < 2 * nelt)
5344 sel[i] = 3 * i + k;
5345 else
5346 sel[i] = 0;
5347 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5349 for (i = 0, j = 0; i < nelt; i++)
5350 if (3 * i + k < 2 * nelt)
5351 sel[i] = i;
5352 else
5353 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5355 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5357 first_vect = dr_chain[0];
5358 second_vect = dr_chain[1];
5360 /* Create interleaving stmt (low part of):
5361 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5362 ...}> */
5363 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5364 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5365 second_vect, perm3_mask_low);
5366 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5368 /* Create interleaving stmt (high part of):
5369 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5370 ...}> */
5371 first_vect = data_ref;
5372 second_vect = dr_chain[2];
5373 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5374 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5375 second_vect, perm3_mask_high);
5376 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5377 (*result_chain)[k] = data_ref;
5380 else
 5382       /* If length is not equal to 3 then only a power of 2 is supported.  */
5383 gcc_assert (exact_log2 (length) != -1);
5385 for (i = 0; i < nelt; ++i)
5386 sel[i] = i * 2;
5387 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5389 for (i = 0; i < nelt; ++i)
5390 sel[i] = i * 2 + 1;
5391 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5393 for (i = 0; i < log_length; i++)
5395 for (j = 0; j < length; j += 2)
5397 first_vect = dr_chain[j];
5398 second_vect = dr_chain[j+1];
5400 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5401 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5402 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5403 first_vect, second_vect,
5404 perm_mask_even);
5405 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5406 (*result_chain)[j/2] = data_ref;
5408 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5409 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5410 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5411 first_vect, second_vect,
5412 perm_mask_odd);
5413 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5414 (*result_chain)[j/2+length/2] = data_ref;
5416 memcpy (dr_chain.address (), result_chain->address (),
5417 length * sizeof (tree));
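/* Editorial sketch (hedged, not part of GCC): a standalone C program that
   simulates the even/odd extraction performed by vect_permute_load_chain
   above, for a chain of 4 vectors of 8 elements, using plain arrays in place
   of VEC_PERM_EXPR.  All names here are illustrative only.  Compiling and
   running it prints the reordered chain shown in the function comment
   (0 4 8 ... / 1 5 9 ... / 2 6 10 ... / 3 7 11 ...).  */

#include <stdio.h>
#include <string.h>

int
main (void)
{
  enum { NELT = 8, LENGTH = 4, LOG_LENGTH = 2 };
  int chain[LENGTH][NELT], next[LENGTH][NELT];
  int i, j, k, v = 0;

  /* Number the input elements 0..31, as in the comment above.  */
  for (i = 0; i < LENGTH; i++)
    for (j = 0; j < NELT; j++)
      chain[i][j] = v++;

  /* log2 (LENGTH) rounds; each round splits the concatenation of two
     adjacent vectors into its even and its odd elements.  Even elements go
     to slot i/2 and odd elements to slot i/2 + LENGTH/2, mirroring
     (*result_chain)[j/2] and (*result_chain)[j/2 + length/2] above.  */
  for (k = 0; k < LOG_LENGTH; k++)
    {
      for (i = 0; i < LENGTH; i += 2)
        for (j = 0; j < NELT; j++)
          {
            int e = 2 * j;  /* even index into the concatenation */
            next[i / 2][j]
              = e < NELT ? chain[i][e] : chain[i + 1][e - NELT];
            next[i / 2 + LENGTH / 2][j]
              = e + 1 < NELT ? chain[i][e + 1] : chain[i + 1][e + 1 - NELT];
          }
      memcpy (chain, next, sizeof chain);
    }

  for (i = 0; i < LENGTH; i++)
    {
      for (j = 0; j < NELT; j++)
        printf (" %2d", chain[i][j]);
      printf ("\n");
    }
  return 0;
}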
5422 /* Function vect_shift_permute_load_chain.
5424 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
5425 sequence of stmts to reorder the input data accordingly.
5426 Return the final references for loads in RESULT_CHAIN.
5427 Return true if successful, false otherwise.
5429 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5430 The input is 3 vectors, each containing 8 elements. We assign a
5431 number to each element; the input sequence is:
5433 1st vec: 0 1 2 3 4 5 6 7
5434 2nd vec: 8 9 10 11 12 13 14 15
5435 3rd vec: 16 17 18 19 20 21 22 23
5437 The output sequence should be:
5439 1st vec: 0 3 6 9 12 15 18 21
5440 2nd vec: 1 4 7 10 13 16 19 22
5441 3rd vec: 2 5 8 11 14 17 20 23
5443 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create this output.
5445 First we shuffle all 3 vectors to get the correct element order:
5447 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5448 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5449 3rd vec: (16 19 22) (17 20 23) (18 21)
5451 Next we unite and shift the vectors 3 times:
5453 1st step:
5454 shift right by 6 the concatenation of:
5455 "1st vec" and "2nd vec"
5456 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5457 "2nd vec" and "3rd vec"
5458 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5459 "3rd vec" and "1st vec"
5460 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5461 | New vectors |
5463 So that now the new vectors are:
5465 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5466 2nd vec: (10 13) (16 19 22) (17 20 23)
5467 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5469 2nd step:
5470 shift right by 5 the concatenation of:
5471 "1st vec" and "3rd vec"
5472 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5473 "2nd vec" and "1st vec"
5474 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5475 "3rd vec" and "2nd vec"
5476 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5477 | New vectors |
5479 So that now the new vectors are:
5481 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5482 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5483 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5485 3rd step:
5486 shift right by 5 the concatenation of:
5487 "1st vec" and "1st vec"
5488 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5489 shift right by 3 the concatenation of:
5490 "2nd vec" and "2nd vec"
5491 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5492 | New vectors |
5494 So that now all vectors are READY:
5495 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5496 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5497 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5499 This algorithm is faster than the one in vect_permute_load_chain if:
5500 1. "shift of a concatenation" is faster than general permutation.
5501 This is usually so.
5502 2. The TARGET machine can't execute vector instructions in parallel.
5503 This is because each step of the algorithm depends on the previous one.
5504 The algorithm in vect_permute_load_chain is much more parallel.
5506 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5509 static bool
5510 vect_shift_permute_load_chain (vec<tree> dr_chain,
5511 unsigned int length,
5512 gimple *stmt,
5513 gimple_stmt_iterator *gsi,
5514 vec<tree> *result_chain)
5516 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5517 tree perm2_mask1, perm2_mask2, perm3_mask;
5518 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5519 gimple *perm_stmt;
5521 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5522 unsigned int i;
5523 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5524 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5525 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5526 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5528 result_chain->quick_grow (length);
5529 memcpy (result_chain->address (), dr_chain.address (),
5530 length * sizeof (tree));
5532 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5534 unsigned int j, log_length = exact_log2 (length);
5535 for (i = 0; i < nelt / 2; ++i)
5536 sel[i] = i * 2;
5537 for (i = 0; i < nelt / 2; ++i)
5538 sel[nelt / 2 + i] = i * 2 + 1;
5539 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5541 if (dump_enabled_p ())
5542 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5543 "shuffle of 2 fields structure is not \
5544 supported by target\n");
5545 return false;
5547 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5549 for (i = 0; i < nelt / 2; ++i)
5550 sel[i] = i * 2 + 1;
5551 for (i = 0; i < nelt / 2; ++i)
5552 sel[nelt / 2 + i] = i * 2;
5553 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5555 if (dump_enabled_p ())
5556 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5557 "shuffle of 2 fields structure is not \
5558 supported by target\n");
5559 return false;
5561 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5563 /* Generating permutation constant to shift all elements.
5564 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5565 for (i = 0; i < nelt; i++)
5566 sel[i] = nelt / 2 + i;
5567 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5569 if (dump_enabled_p ())
5570 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5571 "shift permutation is not supported by target\n");
5572 return false;
5574 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5576 /* Generating permutation constant to select elements from the 2 vectors.
5577 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5578 for (i = 0; i < nelt / 2; i++)
5579 sel[i] = i;
5580 for (i = nelt / 2; i < nelt; i++)
5581 sel[i] = nelt + i;
5582 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5584 if (dump_enabled_p ())
5585 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5586 "select is not supported by target\n");
5587 return false;
5589 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5591 for (i = 0; i < log_length; i++)
5593 for (j = 0; j < length; j += 2)
5595 first_vect = dr_chain[j];
5596 second_vect = dr_chain[j + 1];
5598 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5599 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5600 first_vect, first_vect,
5601 perm2_mask1);
5602 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5603 vect[0] = data_ref;
5605 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5606 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5607 second_vect, second_vect,
5608 perm2_mask2);
5609 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5610 vect[1] = data_ref;
5612 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5613 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5614 vect[0], vect[1], shift1_mask);
5615 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5616 (*result_chain)[j/2 + length/2] = data_ref;
5618 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5619 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5620 vect[0], vect[1], select_mask);
5621 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5622 (*result_chain)[j/2] = data_ref;
5624 memcpy (dr_chain.address (), result_chain->address (),
5625 length * sizeof (tree));
5627 return true;
5629 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5631 unsigned int k = 0, l = 0;
5633 /* Generating permutation constant to get all elements in the right order.
5634 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5635 for (i = 0; i < nelt; i++)
5637 if (3 * k + (l % 3) >= nelt)
5639 k = 0;
5640 l += (3 - (nelt % 3));
5642 sel[i] = 3 * k + (l % 3);
5643 k++;
5645 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5647 if (dump_enabled_p ())
5648 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5649 "shuffle of 3 fields structure is not \
5650 supported by target\n");
5651 return false;
5653 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5655 /* Generating permutation constant to shift all elements.
5656 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5657 for (i = 0; i < nelt; i++)
5658 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5659 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5661 if (dump_enabled_p ())
5662 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5663 "shift permutation is not supported by target\n");
5664 return false;
5666 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5668 /* Generating permutation constant to shift all elements.
5669 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5670 for (i = 0; i < nelt; i++)
5671 sel[i] = 2 * (nelt / 3) + 1 + i;
5672 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5674 if (dump_enabled_p ())
5675 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5676 "shift permutation is not supported by target\n");
5677 return false;
5679 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5681 /* Generating permutation constant to shift all elements.
5682 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5683 for (i = 0; i < nelt; i++)
5684 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5685 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5687 if (dump_enabled_p ())
5688 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5689 "shift permutation is not supported by target\n");
5690 return false;
5692 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5694 /* Generating permutation constant to shift all elements.
5695 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5696 for (i = 0; i < nelt; i++)
5697 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5698 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5700 if (dump_enabled_p ())
5701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5702 "shift permutation is not supported by target\n");
5703 return false;
5705 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5707 for (k = 0; k < 3; k++)
5709 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5710 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5711 dr_chain[k], dr_chain[k],
5712 perm3_mask);
5713 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5714 vect[k] = data_ref;
5717 for (k = 0; k < 3; k++)
5719 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5720 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5721 vect[k % 3], vect[(k + 1) % 3],
5722 shift1_mask);
5723 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5724 vect_shift[k] = data_ref;
5727 for (k = 0; k < 3; k++)
5729 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5730 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5731 vect_shift[(4 - k) % 3],
5732 vect_shift[(3 - k) % 3],
5733 shift2_mask);
5734 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5735 vect[k] = data_ref;
5738 (*result_chain)[3 - (nelt % 3)] = vect[2];
5740 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5741 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5742 vect[0], shift3_mask);
5743 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5744 (*result_chain)[nelt % 3] = data_ref;
5746 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5747 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5748 vect[1], shift4_mask);
5749 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5750 (*result_chain)[0] = data_ref;
5751 return true;
5753 return false;
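/* Editorial sketch (hedged, not GCC code): a standalone C program that walks
   through the LENGTH == 3 path of vect_shift_permute_load_chain above for
   8 elements per vector.  vec_perm below models VEC_PERM_EXPR on the
   concatenation of its two operands, and the mask values are the ones quoted
   in the function's comments ({0 3 6 1 4 7 2 5}, {6..13}, {5..12}, {3..10},
   {5..12}).  All names are illustrative only.  Running it prints the strided
   vectors 0 3 6 ... / 1 4 7 ... / 2 5 8 ...  */

#include <stdio.h>

#define NELT 8

static void
vec_perm (const int *a, const int *b, const unsigned char *sel, int *out)
{
  int i;
  /* Element i of the result is element sel[i] of the concatenation A|B.  */
  for (i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? a[sel[i]] : b[sel[i] - NELT];
}

int
main (void)
{
  int dr_chain[3][NELT], vect[3][NELT], vect_shift[3][NELT];
  int result_chain[3][NELT];
  static const unsigned char perm3_mask[NELT]  = { 0, 3, 6, 1, 4, 7, 2, 5 };
  static const unsigned char shift1_mask[NELT] = { 6, 7, 8, 9, 10, 11, 12, 13 };
  static const unsigned char shift2_mask[NELT] = { 5, 6, 7, 8, 9, 10, 11, 12 };
  static const unsigned char shift3_mask[NELT] = { 3, 4, 5, 6, 7, 8, 9, 10 };
  static const unsigned char shift4_mask[NELT] = { 5, 6, 7, 8, 9, 10, 11, 12 };
  int k, i;

  for (k = 0; k < 3; k++)
    for (i = 0; i < NELT; i++)
      dr_chain[k][i] = k * NELT + i;

  /* Shuffle each vector into (0 3 6) (1 4 7) (2 5) order.  */
  for (k = 0; k < 3; k++)
    vec_perm (dr_chain[k], dr_chain[k], perm3_mask, vect[k]);

  /* 1st step: shift right by 6 the concatenation of adjacent vectors.  */
  for (k = 0; k < 3; k++)
    vec_perm (vect[k % 3], vect[(k + 1) % 3], shift1_mask, vect_shift[k]);

  /* 2nd step: shift right by 5.  */
  for (k = 0; k < 3; k++)
    vec_perm (vect_shift[(4 - k) % 3], vect_shift[(3 - k) % 3], shift2_mask,
              vect[k]);

  /* One vector is ready; two more shifts place the remaining ones, using the
     same result_chain slots as the function above (3 - nelt % 3 == 1,
     nelt % 3 == 2, and 0).  */
  for (i = 0; i < NELT; i++)
    result_chain[1][i] = vect[2][i];
  vec_perm (vect[0], vect[0], shift3_mask, result_chain[2]);
  vec_perm (vect[1], vect[1], shift4_mask, result_chain[0]);

  for (k = 0; k < 3; k++)
    {
      for (i = 0; i < NELT; i++)
        printf (" %2d", result_chain[k][i]);
      printf ("\n");
    }
  return 0;
}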
5756 /* Function vect_transform_grouped_load.
5758 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5759 to perform their permutation and attach the resulting vectorized statements to
5760 the scalar statements.
5763 void
5764 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5765 gimple_stmt_iterator *gsi)
5767 machine_mode mode;
5768 vec<tree> result_chain = vNULL;
5770 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5771 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
5772 vectors that are ready for vector computation. */
5773 result_chain.create (size);
5775 /* If the reassociation width for the vector type is 2 or greater, the target
5776 machine can execute 2 or more vector instructions in parallel. Otherwise try
5777 to get the chain for the load group using vect_shift_permute_load_chain. */
5778 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5779 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5780 || exact_log2 (size) != -1
5781 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5782 gsi, &result_chain))
5783 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5784 vect_record_grouped_load_vectors (stmt, result_chain);
5785 result_chain.release ();
5788 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5789 generated as part of the vectorization of STMT. Assign the statement
5790 for each vector to the associated scalar statement. */
5792 void
5793 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5795 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5796 gimple *next_stmt, *new_stmt;
5797 unsigned int i, gap_count;
5798 tree tmp_data_ref;
5800 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5801 Since we scan the chain starting from its first node, their order
5802 corresponds to the order of data-refs in RESULT_CHAIN. */
5803 next_stmt = first_stmt;
5804 gap_count = 1;
5805 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5807 if (!next_stmt)
5808 break;
5810 /* Skip the gaps. Loads created for the gaps will be removed later by the
5811 dead code elimination pass. No need to check for the first stmt in
5812 the group, since it always exists.
5813 GROUP_GAP is the number of steps in elements from the previous
5814 access (if there is no gap GROUP_GAP is 1). We skip loads that
5815 correspond to the gaps. */
5816 if (next_stmt != first_stmt
5817 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5819 gap_count++;
5820 continue;
5823 while (next_stmt)
5825 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5826 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5827 copies, and we put the new vector statement in the first available
5828 RELATED_STMT. */
5829 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5830 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5831 else
5833 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5835 gimple *prev_stmt =
5836 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5837 gimple *rel_stmt =
5838 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5839 while (rel_stmt)
5841 prev_stmt = rel_stmt;
5842 rel_stmt =
5843 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5846 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5847 new_stmt;
5851 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5852 gap_count = 1;
5853 /* If NEXT_STMT accesses the same DR as the previous statement,
5854 put the same TMP_DATA_REF as its vectorized statement; otherwise
5855 get the next data-ref from RESULT_CHAIN. */
5856 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5857 break;
5862 /* Function vect_can_force_dr_alignment_p.
5864 Returns whether the alignment of a DECL can be forced to be aligned
5865 on an ALIGNMENT-bit boundary. */
5867 bool
5868 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5870 if (TREE_CODE (decl) != VAR_DECL)
5871 return false;
5873 if (decl_in_symtab_p (decl)
5874 && !symtab_node::get (decl)->can_increase_alignment_p ())
5875 return false;
5877 if (TREE_STATIC (decl))
5878 return (alignment <= MAX_OFILE_ALIGNMENT);
5879 else
5880 return (alignment <= MAX_STACK_ALIGNMENT);
5884 /* Return whether the data reference DR is supported with respect to its
5885 alignment.
5886 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5887 if it is aligned, i.e., check if it is possible to vectorize it with a different
5888 alignment. */
5890 enum dr_alignment_support
5891 vect_supportable_dr_alignment (struct data_reference *dr,
5892 bool check_aligned_accesses)
5894 gimple *stmt = DR_STMT (dr);
5895 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5896 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5897 machine_mode mode = TYPE_MODE (vectype);
5898 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5899 struct loop *vect_loop = NULL;
5900 bool nested_in_vect_loop = false;
5902 if (aligned_access_p (dr) && !check_aligned_accesses)
5903 return dr_aligned;
5905 /* For now assume all conditional loads/stores support unaligned
5906 access without any special code. */
5907 if (is_gimple_call (stmt)
5908 && gimple_call_internal_p (stmt)
5909 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5910 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5911 return dr_unaligned_supported;
5913 if (loop_vinfo)
5915 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5916 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5919 /* Possibly unaligned access. */
5921 /* We can choose between using the implicit realignment scheme (generating
5922 a misaligned_move stmt) and the explicit realignment scheme (generating
5923 aligned loads with a REALIGN_LOAD). There are two variants to the
5924 explicit realignment scheme: optimized, and unoptimized.
5925 We can optimize the realignment only if the step between consecutive
5926 vector loads is equal to the vector size. Since the vector memory
5927 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5928 is guaranteed that the misalignment amount remains the same throughout the
5929 execution of the vectorized loop. Therefore, we can create the
5930 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5931 at the loop preheader.
5933 However, in the case of outer-loop vectorization, when vectorizing a
5934 memory access in the inner-loop nested within the LOOP that is now being
5935 vectorized, while it is guaranteed that the misalignment of the
5936 vectorized memory access will remain the same in different outer-loop
5937 iterations, it is *not* guaranteed that it will remain the same throughout
5938 the execution of the inner-loop. This is because the inner-loop advances
5939 with the original scalar step (and not in steps of VS). If the inner-loop
5940 step happens to be a multiple of VS, then the misalignment remains fixed
5941 and we can use the optimized realignment scheme. For example:
5943 for (i=0; i<N; i++)
5944 for (j=0; j<M; j++)
5945 s += a[i+j];
5947 When vectorizing the i-loop in the above example, the step between
5948 consecutive vector loads is 1, and so the misalignment does not remain
5949 fixed across the execution of the inner-loop, and the realignment cannot
5950 be optimized (as illustrated in the following pseudo vectorized loop):
5952 for (i=0; i<N; i+=4)
5953 for (j=0; j<M; j++){
5954 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5955 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5956 // (assuming that we start from an aligned address).
5959 We therefore have to use the unoptimized realignment scheme:
5961 for (i=0; i<N; i+=4)
5962 for (j=k; j<M; j+=4)
5963 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5964 // that the misalignment of the initial address is
5965 // 0).
5967 The loop can then be vectorized as follows:
5969 for (k=0; k<4; k++){
5970 rt = get_realignment_token (&vp[k]);
5971 for (i=0; i<N; i+=4){
5972 v1 = vp[i+k];
5973 for (j=k; j<M; j+=4){
5974 v2 = vp[i+j+VS-1];
5975 va = REALIGN_LOAD <v1,v2,rt>;
5976 vs += va;
5977 v1 = v2;
5980 } */
5982 if (DR_IS_READ (dr))
5984 bool is_packed = false;
5985 tree type = (TREE_TYPE (DR_REF (dr)));
5987 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5988 && (!targetm.vectorize.builtin_mask_for_load
5989 || targetm.vectorize.builtin_mask_for_load ()))
5991 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5993 /* If we are doing SLP then the accesses need not have the
5994 same alignment; instead it depends on the SLP group size. */
5995 if (loop_vinfo
5996 && STMT_SLP_TYPE (stmt_info)
5997 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5998 * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
5999 % TYPE_VECTOR_SUBPARTS (vectype) != 0))
6001 else if (!loop_vinfo
6002 || (nested_in_vect_loop
6003 && (TREE_INT_CST_LOW (DR_STEP (dr))
6004 != GET_MODE_SIZE (TYPE_MODE (vectype)))))
6005 return dr_explicit_realign;
6006 else
6007 return dr_explicit_realign_optimized;
6009 if (!known_alignment_for_access_p (dr))
6010 is_packed = not_size_aligned (DR_REF (dr));
6012 if ((TYPE_USER_ALIGN (type) && !is_packed)
6013 || targetm.vectorize.
6014 support_vector_misalignment (mode, type,
6015 DR_MISALIGNMENT (dr), is_packed))
6016 /* Can't software pipeline the loads, but can at least do them. */
6017 return dr_unaligned_supported;
6019 else
6021 bool is_packed = false;
6022 tree type = (TREE_TYPE (DR_REF (dr)));
6024 if (!known_alignment_for_access_p (dr))
6025 is_packed = not_size_aligned (DR_REF (dr));
6027 if ((TYPE_USER_ALIGN (type) && !is_packed)
6028 || targetm.vectorize.
6029 support_vector_misalignment (mode, type,
6030 DR_MISALIGNMENT (dr), is_packed))
6031 return dr_unaligned_supported;
6034 /* Unsupported. */
6035 return dr_unaligned_unsupported;
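/* Editorial sketch (hedged, not GCC code): a rough, self-contained picture of
   how a caller acts on the value returned by vect_supportable_dr_alignment.
   The enum below mirrors dr_alignment_support from tree-vectorizer.h (the
   sk_ prefix is added only to keep this sketch standalone); in GCC itself the
   real dispatch lives in callers such as vectorizable_load /
   vectorizable_store in tree-vect-stmts.c.  */

#include <stdio.h>

enum dr_alignment_support_sketch
{
  sk_dr_unaligned_unsupported,
  sk_dr_unaligned_supported,
  sk_dr_explicit_realign,
  sk_dr_explicit_realign_optimized,
  sk_dr_aligned
};

static const char *
alignment_strategy (enum dr_alignment_support_sketch s)
{
  switch (s)
    {
    case sk_dr_aligned:
      return "ordinary aligned vector access";
    case sk_dr_unaligned_supported:
      return "direct misaligned vector access";
    case sk_dr_explicit_realign:
      return "two aligned loads + REALIGN_LOAD, mask recomputed per access";
    case sk_dr_explicit_realign_optimized:
      return "REALIGN_LOAD with the mask hoisted to the loop preheader";
    default:
      return "not vectorizable at this alignment";
    }
}

int
main (void)
{
  printf ("%s\n", alignment_strategy (sk_dr_explicit_realign_optimized));
  return 0;
}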