gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2017 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
54 /* Return true if load- or store-lanes optab OPTAB is implemented for
55 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
57 static bool
58 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
59 tree vectype, unsigned HOST_WIDE_INT count)
61 machine_mode mode, array_mode;
62 bool limit_p;
64 mode = TYPE_MODE (vectype);
65 limit_p = !targetm.array_mode_supported_p (mode, count);
66 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
67 MODE_INT, limit_p);
69 if (array_mode == BLKmode)
71 if (dump_enabled_p ())
72 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
73 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
74 GET_MODE_NAME (mode), count);
75 return false;
78 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
80 if (dump_enabled_p ())
81 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
82 "cannot use %s<%s><%s>\n", name,
83 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
84 return false;
87 if (dump_enabled_p ())
88 dump_printf_loc (MSG_NOTE, vect_location,
89 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
90 GET_MODE_NAME (mode));
92 return true;
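/* Illustrative sketch, not part of the original source: on a target with
   128-bit vectors, asking whether 2-lane loads of a V4SI vector type are
   available would look roughly like

     vect_lanes_optab_supported_p ("vec_load_lanes", vec_load_lanes_optab,
                                   vectype, 2);

   where 'vectype' is assumed to be the V4SI vector type.  The function then
   looks for an integer mode of 2 * 128 = 256 bits and for a
   vec_load_lanes<array_mode><mode> pattern (e.g. ld2-style structure loads
   on ARM/AArch64).  */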
96 /* Return the smallest scalar type used in STMT.
97 This is used to determine the vectype of the stmt. We generally set the
98 vectype according to the type of the result (lhs). For stmts whose
99 result-type is different than the type of the arguments (e.g., demotion,
100 promotion), vectype will be reset appropriately (later). Note that we have
101 to visit the smallest datatype in this function, because that determines the
102 VF. If the smallest datatype in the loop is present only as the rhs of a
103 promotion operation - we'd miss it.
104 Such a case, where a variable of this datatype does not appear in the lhs
105 anywhere in the loop, can only occur if it's an invariant: e.g.:
106 'int_x = (int) short_inv', which we'd expect to have been optimized away by
107 invariant motion. However, we cannot rely on invariant motion to always
108 take invariants out of the loop, and so in the case of promotion we also
109 have to check the rhs.
110 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
111 types. */
113 tree
114 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
115 HOST_WIDE_INT *rhs_size_unit)
117 tree scalar_type = gimple_expr_type (stmt);
118 HOST_WIDE_INT lhs, rhs;
120 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
122 if (is_gimple_assign (stmt)
123 && (gimple_assign_cast_p (stmt)
124 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
125 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
126 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
128 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
130 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
131 if (rhs < lhs)
132 scalar_type = rhs_type;
135 *lhs_size_unit = lhs;
136 *rhs_size_unit = rhs;
137 return scalar_type;
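/* Worked example, illustrative only: for the promotion statement

     int_x = (int) short_y;

   the lhs type is 'int' and the rhs type is 'short int', so on a typical
   target the function returns 'short int' with *LHS_SIZE_UNIT == 4 and
   *RHS_SIZE_UNIT == 2, letting the smaller type determine the vectorization
   factor.  ('short_y' is a hypothetical variable name.)  */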
141 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
142 tested at run-time. Return TRUE if DDR was successfully inserted.
143 Return false if versioning is not supported. */
145 static bool
146 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
148 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
150 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
151 return false;
153 if (dump_enabled_p ())
155 dump_printf_loc (MSG_NOTE, vect_location,
156 "mark for run-time aliasing test between ");
157 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
158 dump_printf (MSG_NOTE, " and ");
159 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
160 dump_printf (MSG_NOTE, "\n");
163 if (optimize_loop_nest_for_size_p (loop))
165 if (dump_enabled_p ())
166 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
167 "versioning not supported when optimizing"
168 " for size.\n");
169 return false;
172 /* FORNOW: We don't support versioning with outer-loop vectorization. */
173 if (loop->inner)
175 if (dump_enabled_p ())
176 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
177 "versioning not yet supported for outer-loops.\n");
178 return false;
181 /* FORNOW: We don't support creating runtime alias tests for non-constant
182 step. */
183 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
184 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
186 if (dump_enabled_p ())
187 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
188 "versioning not yet supported for non-constant "
189 "step\n");
190 return false;
193 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
194 return true;
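/* Sketch of the intended effect, not part of the original source: for

     for (i = 0; i < n; i++)
       p[i] = q[i] + 1;

   where p and q may alias, recording the DDR here allows the vectorizer to
   later version the loop roughly as

     if (p + n <= q || q + n <= p)
       ... vectorized loop ...
     else
       ... scalar loop ...

   The actual segment-overlap test generated later also accounts for access
   sizes and the number of vectorized iterations; the form above is only a
   simplified sketch.  */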
198 /* Function vect_analyze_data_ref_dependence.
200 Return TRUE if there (might) exist a dependence between a memory-reference
201 DRA and a memory-reference DRB. If the dependence can instead be checked
202 at run-time by versioning for alias, return FALSE. Adjust *MAX_VF according
203 to the data dependence. */
205 static bool
206 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
207 loop_vec_info loop_vinfo, int *max_vf)
209 unsigned int i;
210 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
211 struct data_reference *dra = DDR_A (ddr);
212 struct data_reference *drb = DDR_B (ddr);
213 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
214 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
215 lambda_vector dist_v;
216 unsigned int loop_depth;
218 /* In loop analysis all data references should be vectorizable. */
219 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
220 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
221 gcc_unreachable ();
223 /* Independent data accesses. */
224 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
225 return false;
227 if (dra == drb
228 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
229 return false;
231 /* We do not have to consider dependences between accesses that belong
232 to the same group. */
233 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
234 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
235 return false;
237 /* Even if we have an anti-dependence then, as the vectorized loop covers at
238 least two scalar iterations, there is always also a true dependence.
239 As the vectorizer does not re-order loads and stores we can ignore
240 the anti-dependence if TBAA can disambiguate both DRs similar to the
241 case with known negative distance anti-dependences (positive
242 distance anti-dependences would violate TBAA constraints). */
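  /* Illustrative example, not from the original source: for a hypothetical
     write-after-read pair

       ... = f[i];        f is 'float *'   (DRA, read)
       n[i] = ...;        n is 'int *'     (DRB, write)

     the alias sets of 'float' and 'int' do not conflict, so the check below
     drops this anti-dependence.  */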
243 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
244 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
245 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
246 get_alias_set (DR_REF (drb))))
247 return false;
249 /* Unknown data dependence. */
250 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
252 /* If user asserted safelen consecutive iterations can be
253 executed concurrently, assume independence. */
254 if (loop->safelen >= 2)
256 if (loop->safelen < *max_vf)
257 *max_vf = loop->safelen;
258 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
259 return false;
262 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
263 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
265 if (dump_enabled_p ())
267 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
268 "versioning for alias not supported for: "
269 "can't determine dependence between ");
270 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
271 DR_REF (dra));
272 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
273 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
274 DR_REF (drb));
275 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
277 return true;
280 if (dump_enabled_p ())
282 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
283 "versioning for alias required: "
284 "can't determine dependence between ");
285 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
286 DR_REF (dra));
287 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
288 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
289 DR_REF (drb));
290 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
293 /* Add to list of ddrs that need to be tested at run-time. */
294 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
297 /* Known data dependence. */
298 if (DDR_NUM_DIST_VECTS (ddr) == 0)
300 /* If user asserted safelen consecutive iterations can be
301 executed concurrently, assume independence. */
302 if (loop->safelen >= 2)
304 if (loop->safelen < *max_vf)
305 *max_vf = loop->safelen;
306 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
307 return false;
310 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
311 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
313 if (dump_enabled_p ())
315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
316 "versioning for alias not supported for: "
317 "bad dist vector for ");
318 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
319 DR_REF (dra));
320 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
321 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
322 DR_REF (drb));
323 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
325 return true;
328 if (dump_enabled_p ())
330 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
331 "versioning for alias required: "
332 "bad dist vector for ");
333 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
334 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
335 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
336 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
338 /* Add to list of ddrs that need to be tested at run-time. */
339 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
342 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
343 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
345 int dist = dist_v[loop_depth];
347 if (dump_enabled_p ())
348 dump_printf_loc (MSG_NOTE, vect_location,
349 "dependence distance = %d.\n", dist);
351 if (dist == 0)
353 if (dump_enabled_p ())
355 dump_printf_loc (MSG_NOTE, vect_location,
356 "dependence distance == 0 between ");
357 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
358 dump_printf (MSG_NOTE, " and ");
359 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
360 dump_printf (MSG_NOTE, "\n");
363 /* When we perform grouped accesses and perform implicit CSE
364 by detecting equal accesses and doing disambiguation with
365 runtime alias tests like for
366 .. = a[i];
367 .. = a[i+1];
368 a[i] = ..;
369 a[i+1] = ..;
370 *p = ..;
371 .. = a[i];
372 .. = a[i+1];
373 where we will end up loading { a[i], a[i+1] } once, make
374 sure that inserting group loads before the first load and
375 stores after the last store will do the right thing.
376 Similar for groups like
377 a[i] = ...;
378 ... = a[i];
379 a[i+1] = ...;
380 where loads from the group interleave with the store. */
381 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
382 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
384 gimple *earlier_stmt;
385 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
386 if (DR_IS_WRITE
387 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
389 if (dump_enabled_p ())
390 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
391 "READ_WRITE dependence in interleaving."
392 "\n");
393 return true;
397 continue;
400 if (dist > 0 && DDR_REVERSED_P (ddr))
402 /* If DDR_REVERSED_P the order of the data-refs in DDR was
403 reversed (to make distance vector positive), and the actual
404 distance is negative. */
405 if (dump_enabled_p ())
406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
407 "dependence distance negative.\n");
408 /* Record a negative dependence distance to later limit the
409 amount of stmt copying / unrolling we can perform.
410 Only need to handle read-after-write dependence. */
411 if (DR_IS_READ (drb)
412 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
413 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
414 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
415 continue;
418 if (abs (dist) >= 2
419 && abs (dist) < *max_vf)
421 /* The dependence distance requires reduction of the maximal
422 vectorization factor. */
423 *max_vf = abs (dist);
424 if (dump_enabled_p ())
425 dump_printf_loc (MSG_NOTE, vect_location,
426 "adjusting maximal vectorization factor to %i\n",
427 *max_vf);
430 if (abs (dist) >= *max_vf)
432 /* Dependence distance does not create dependence, as far as
433 vectorization is concerned, in this case. */
434 if (dump_enabled_p ())
435 dump_printf_loc (MSG_NOTE, vect_location,
436 "dependence distance >= VF.\n");
437 continue;
440 if (dump_enabled_p ())
442 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
443 "not vectorized, possible dependence "
444 "between data-refs ");
445 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
446 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
447 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
448 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
451 return true;
454 return false;
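/* Worked example, illustrative only: for

     for (i = 0; i < n; i++)
       a[i + 4] = a[i] + 1;

   the dependence distance in the loop is 4.  Starting from *MAX_VF == 8,
   the analysis above lowers *MAX_VF to 4: a vector iteration covering up to
   4 scalar iterations never reads an element written in the same vector
   iteration, whereas one covering 8 would.  */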
457 /* Function vect_analyze_data_ref_dependences.
459 Examine all the data references in the loop, and make sure there do not
460 exist any data dependences between them. Set *MAX_VF according to
461 the maximum vectorization factor the data dependences allow. */
463 bool
464 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
466 unsigned int i;
467 struct data_dependence_relation *ddr;
469 if (dump_enabled_p ())
470 dump_printf_loc (MSG_NOTE, vect_location,
471 "=== vect_analyze_data_ref_dependences ===\n");
473 LOOP_VINFO_DDRS (loop_vinfo)
474 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
475 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
476 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
477 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
478 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
479 &LOOP_VINFO_DDRS (loop_vinfo),
480 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
481 return false;
483 /* For epilogues we either have no aliases or alias versioning
484 was applied to original loop. Therefore we may just get max_vf
485 using VF of original loop. */
486 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
487 *max_vf = LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo);
488 else
489 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
490 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
491 return false;
493 return true;
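/* Illustrative note, not part of the original source: the safelen handling
   in vect_analyze_data_ref_dependence means that for

     #pragma omp simd safelen(8)
     for (i = 0; i < n; i++)
       a[p[i]] = b[i];

   the otherwise unanalyzable dependence through a[p[i]] is assumed absent
   for up to 8 concurrent iterations and *MAX_VF is clamped to at most 8.  */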
497 /* Function vect_slp_analyze_data_ref_dependence.
499 Return TRUE if there (might) exist a dependence between a memory-reference
500 DRA and a memory-reference DRB that prohibits SLP vectorization. Unlike
501 the loop variant above, no run-time alias versioning and no *MAX_VF
502 adjustment is done here. */
504 static bool
505 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
507 struct data_reference *dra = DDR_A (ddr);
508 struct data_reference *drb = DDR_B (ddr);
510 /* We need to check dependences of statements marked as unvectorizable
511 as well, they still can prohibit vectorization. */
513 /* Independent data accesses. */
514 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
515 return false;
517 if (dra == drb)
518 return false;
520 /* Read-read is OK. */
521 if (DR_IS_READ (dra) && DR_IS_READ (drb))
522 return false;
524 /* If dra and drb are part of the same interleaving chain consider
525 them independent. */
526 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
527 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
528 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
529 return false;
531 /* Unknown data dependence. */
532 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
534 if (dump_enabled_p ())
536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
537 "can't determine dependence between ");
538 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
539 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
540 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
541 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
544 else if (dump_enabled_p ())
546 dump_printf_loc (MSG_NOTE, vect_location,
547 "determined dependence between ");
548 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
549 dump_printf (MSG_NOTE, " and ");
550 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
551 dump_printf (MSG_NOTE, "\n");
554 return true;
558 /* Analyze dependences involved in the transform of SLP NODE. STORES
559 contains the vector of scalar stores of this instance if we are
560 disambiguating the loads. */
562 static bool
563 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
564 vec<gimple *> stores, gimple *last_store)
566 /* This walks over all stmts involved in the SLP load/store done
567 in NODE verifying we can sink them up to the last stmt in the
568 group. */
569 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
570 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
572 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
573 if (access == last_access)
574 continue;
575 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
576 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
577 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
579 gimple *stmt = gsi_stmt (gsi);
580 if (! gimple_vuse (stmt)
581 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
582 continue;
584 /* If we couldn't record a (single) data reference for this
585 stmt we have to give up. */
586 /* ??? Here and below if dependence analysis fails we can resort
587 to the alias oracle which can handle more kinds of stmts. */
588 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
589 if (!dr_b)
590 return false;
592 bool dependent = false;
593 /* If we run into a store of this same instance (we've just
594 marked those) then delay dependence checking until we run
595 into the last store because this is where it will have
596 been sunk to (and we verify if we can do that as well). */
597 if (gimple_visited_p (stmt))
599 if (stmt != last_store)
600 continue;
601 unsigned i;
602 gimple *store;
603 FOR_EACH_VEC_ELT (stores, i, store)
605 data_reference *store_dr
606 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
607 ddr_p ddr = initialize_data_dependence_relation
608 (dr_a, store_dr, vNULL);
609 dependent = vect_slp_analyze_data_ref_dependence (ddr);
610 free_dependence_relation (ddr);
611 if (dependent)
612 break;
615 else
617 ddr_p ddr = initialize_data_dependence_relation (dr_a,
618 dr_b, vNULL);
619 dependent = vect_slp_analyze_data_ref_dependence (ddr);
620 free_dependence_relation (ddr);
622 if (dependent)
623 return false;
626 return true;
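/* Illustrative sketch, not part of the original source: for an SLP store
   group with an intervening memory access

     a[0] = x;          group member
     t = *p;            intervening stmt with a virtual use
     a[1] = y;          last group access

   sinking the whole group to the position of 'a[1] = y' is only valid if
   the intervening load '*p' does not depend on 'a[0] = x'; that is what the
   walk above verifies for every stmt between a group member and the last
   group access ('p', 'x', 'y' and 't' are hypothetical).  */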
630 /* Function vect_slp_analyze_instance_dependence.
632 Examine all the data references in the SLP instance INSTANCE and make
633 sure there do not exist any data dependences between them that would
634 prevent the instance from being vectorized. */
636 bool
637 vect_slp_analyze_instance_dependence (slp_instance instance)
639 if (dump_enabled_p ())
640 dump_printf_loc (MSG_NOTE, vect_location,
641 "=== vect_slp_analyze_instance_dependence ===\n");
643 /* The stores of this instance are at the root of the SLP tree. */
644 slp_tree store = SLP_INSTANCE_TREE (instance);
645 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
646 store = NULL;
648 /* Verify we can sink stores to the vectorized stmt insert location. */
649 gimple *last_store = NULL;
650 if (store)
652 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
653 return false;
655 /* Mark stores in this instance and remember the last one. */
656 last_store = vect_find_last_scalar_stmt_in_slp (store);
657 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
658 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
661 bool res = true;
663 /* Verify we can sink loads to the vectorized stmt insert location,
664 special-casing stores of this instance. */
665 slp_tree load;
666 unsigned int i;
667 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
668 if (! vect_slp_analyze_node_dependences (instance, load,
669 store
670 ? SLP_TREE_SCALAR_STMTS (store)
671 : vNULL, last_store))
673 res = false;
674 break;
677 /* Unset the visited flag. */
678 if (store)
679 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
680 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
682 return res;
685 /* Function vect_compute_data_ref_alignment
687 Compute the misalignment of the data reference DR.
689 Output:
690 1. If during the misalignment computation it is found that the data reference
691 cannot be vectorized then false is returned.
692 2. DR_MISALIGNMENT (DR) is defined.
694 FOR NOW: No analysis is actually performed. Misalignment is calculated
695 only for trivial cases. TODO. */
697 bool
698 vect_compute_data_ref_alignment (struct data_reference *dr)
700 gimple *stmt = DR_STMT (dr);
701 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
702 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
703 struct loop *loop = NULL;
704 tree ref = DR_REF (dr);
705 tree vectype;
706 tree base, base_addr;
707 tree misalign = NULL_TREE;
708 tree aligned_to;
709 tree step;
710 unsigned HOST_WIDE_INT alignment;
712 if (dump_enabled_p ())
713 dump_printf_loc (MSG_NOTE, vect_location,
714 "vect_compute_data_ref_alignment:\n");
716 if (loop_vinfo)
717 loop = LOOP_VINFO_LOOP (loop_vinfo);
719 /* Initialize misalignment to unknown. */
720 SET_DR_MISALIGNMENT (dr, -1);
722 if (tree_fits_shwi_p (DR_STEP (dr)))
723 misalign = DR_INIT (dr);
724 aligned_to = DR_ALIGNED_TO (dr);
725 base_addr = DR_BASE_ADDRESS (dr);
726 vectype = STMT_VINFO_VECTYPE (stmt_info);
728 /* In case the dataref is in an inner-loop of the loop that is being
729 vectorized (LOOP), we use the base and misalignment information
730 relative to the outer-loop (LOOP). This is ok only if the misalignment
731 stays the same throughout the execution of the inner-loop, which is why
732 we have to check that the stride of the dataref in the inner-loop is a
733 multiple of the vector size. */
734 if (loop && nested_in_vect_loop_p (loop, stmt))
736 tree step = DR_STEP (dr);
738 if (tree_fits_shwi_p (step)
739 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
741 if (dump_enabled_p ())
742 dump_printf_loc (MSG_NOTE, vect_location,
743 "inner step divides the vector-size.\n");
744 misalign = STMT_VINFO_DR_INIT (stmt_info);
745 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
746 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
748 else
750 if (dump_enabled_p ())
751 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
752 "inner step doesn't divide the vector-size.\n");
753 misalign = NULL_TREE;
757 /* Similarly we can only use base and misalignment information relative to
758 an innermost loop if the misalignment stays the same throughout the
759 execution of the loop. As above, this is the case if the stride times
760 the vectorization factor is a multiple of the vector size. */
761 else
763 tree step = DR_STEP (dr);
764 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
766 if (tree_fits_shwi_p (step)
767 && ((tree_to_shwi (step) * vf)
768 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
770 if (dump_enabled_p ())
771 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
772 "step doesn't divide the vector-size.\n");
773 misalign = NULL_TREE;
777 /* To look at alignment of the base we have to preserve an inner MEM_REF
778 as that carries alignment information of the actual access. */
779 base = ref;
780 while (handled_component_p (base))
781 base = TREE_OPERAND (base, 0);
782 unsigned int base_alignment = 0;
783 unsigned HOST_WIDE_INT base_bitpos;
784 get_object_alignment_1 (base, &base_alignment, &base_bitpos);
785 /* As data-ref analysis strips the MEM_REF down to its base operand
786 to form DR_BASE_ADDRESS and adds the offset to DR_INIT we have to
787 adjust things to make base_alignment valid as the alignment of
788 DR_BASE_ADDRESS. */
789 if (TREE_CODE (base) == MEM_REF)
791 /* Note all this only works if DR_BASE_ADDRESS is the same as
792 MEM_REF operand zero, otherwise DR/SCEV analysis might have factored
793 in other offsets. We need to rework DR to compute the alignment
794 of DR_BASE_ADDRESS as long as all information is still available. */
795 if (operand_equal_p (TREE_OPERAND (base, 0), base_addr, 0))
797 base_bitpos -= mem_ref_offset (base).to_short_addr () * BITS_PER_UNIT;
798 base_bitpos &= (base_alignment - 1);
800 else
801 base_bitpos = BITS_PER_UNIT;
803 if (base_bitpos != 0)
804 base_alignment = base_bitpos & -base_bitpos;
805 /* Also look at the alignment of the base address DR analysis
806 computed. */
807 unsigned int base_addr_alignment = get_pointer_alignment (base_addr);
808 if (base_addr_alignment > base_alignment)
809 base_alignment = base_addr_alignment;
811 if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
812 DR_VECT_AUX (dr)->base_element_aligned = true;
814 alignment = TYPE_ALIGN_UNIT (vectype);
816 if ((compare_tree_int (aligned_to, alignment) < 0)
817 || !misalign)
819 if (dump_enabled_p ())
821 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
822 "Unknown alignment for access: ");
823 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
824 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
826 return true;
829 if (base_alignment < TYPE_ALIGN (vectype))
831 base = base_addr;
832 if (TREE_CODE (base) == ADDR_EXPR)
833 base = TREE_OPERAND (base, 0);
834 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
836 if (dump_enabled_p ())
838 dump_printf_loc (MSG_NOTE, vect_location,
839 "can't force alignment of ref: ");
840 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
841 dump_printf (MSG_NOTE, "\n");
843 return true;
846 if (DECL_USER_ALIGN (base))
848 if (dump_enabled_p ())
850 dump_printf_loc (MSG_NOTE, vect_location,
851 "not forcing alignment of user-aligned "
852 "variable: ");
853 dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
854 dump_printf (MSG_NOTE, "\n");
856 return true;
859 /* Force the alignment of the decl.
860 NOTE: This is the only change to the code we make during
861 the analysis phase, before deciding to vectorize the loop. */
862 if (dump_enabled_p ())
864 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
865 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
866 dump_printf (MSG_NOTE, "\n");
869 DR_VECT_AUX (dr)->base_decl = base;
870 DR_VECT_AUX (dr)->base_misaligned = true;
871 DR_VECT_AUX (dr)->base_element_aligned = true;
874 if (loop && nested_in_vect_loop_p (loop, stmt))
875 step = STMT_VINFO_DR_STEP (stmt_info);
876 else
877 step = DR_STEP (dr);
878 /* If this is a backward running DR then first access in the larger
879 vectype actually is N-1 elements before the address in the DR.
880 Adjust misalign accordingly. */
881 if (tree_int_cst_sgn (step) < 0)
883 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
884 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
885 otherwise we wouldn't be here. */
886 offset = fold_build2 (MULT_EXPR, ssizetype, offset, step);
887 /* PLUS because STEP was negative. */
888 misalign = size_binop (PLUS_EXPR, misalign, offset);
891 SET_DR_MISALIGNMENT (dr,
892 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
894 if (dump_enabled_p ())
896 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
897 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
898 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
899 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
902 return true;
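/* Worked example, illustrative only: for

     float a[256] __attribute__((aligned(16)));
     ... = a[i + 1];

   and a 16-byte vector type V4SF, the base is 16-byte aligned, MISALIGN is
   the constant DR_INIT == 4 and ALIGNMENT is 16, so DR_MISALIGNMENT becomes
   4 mod 16 == 4 bytes.  For a backward-running DR the code above first adds
   (nunits - 1) * step, a negative value, before taking the modulo.  */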
906 /* Function vect_update_misalignment_for_peel
908 DR - the data reference whose misalignment is to be adjusted.
909 DR_PEEL - the data reference whose misalignment is being made
910 zero in the vector loop by the peel.
911 NPEEL - the number of iterations in the peel loop if the misalignment
912 of DR_PEEL is known at compile time. */
914 static void
915 vect_update_misalignment_for_peel (struct data_reference *dr,
916 struct data_reference *dr_peel, int npeel)
918 unsigned int i;
919 vec<dr_p> same_align_drs;
920 struct data_reference *current_dr;
921 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
922 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
923 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
924 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
926 /* For interleaved data accesses the step in the loop must be multiplied by
927 the size of the interleaving group. */
928 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
929 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
930 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
931 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
933 /* It can be assumed that the data refs with the same alignment as dr_peel
934 are aligned in the vector loop. */
935 same_align_drs
936 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
937 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
939 if (current_dr != dr)
940 continue;
941 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
942 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
943 SET_DR_MISALIGNMENT (dr, 0);
944 return;
947 if (known_alignment_for_access_p (dr)
948 && known_alignment_for_access_p (dr_peel))
950 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
951 int misal = DR_MISALIGNMENT (dr);
952 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
953 misal += negative ? -npeel * dr_size : npeel * dr_size;
954 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
955 SET_DR_MISALIGNMENT (dr, misal);
956 return;
959 if (dump_enabled_p ())
960 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
961 SET_DR_MISALIGNMENT (dr, -1);
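/* Worked example, illustrative only: suppose DR accesses 4-byte elements,
   DR_MISALIGNMENT (DR) == 4 against a 16-byte aligned vector type, the step
   is positive and the peel loop runs NPEEL == 3 iterations.  Then

     misal = 4 + 3 * 4;          misal & (16 - 1) == 0

   so DR happens to become aligned as well; with NPEEL == 2 the new
   misalignment would be 12 bytes.  */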
965 /* Function verify_data_ref_alignment
967 Return TRUE if DR can be handled with respect to alignment. */
969 static bool
970 verify_data_ref_alignment (data_reference_p dr)
972 enum dr_alignment_support supportable_dr_alignment
973 = vect_supportable_dr_alignment (dr, false);
974 if (!supportable_dr_alignment)
976 if (dump_enabled_p ())
978 if (DR_IS_READ (dr))
979 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
980 "not vectorized: unsupported unaligned load.");
981 else
982 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
983 "not vectorized: unsupported unaligned "
984 "store.");
986 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
987 DR_REF (dr));
988 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
990 return false;
993 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
994 dump_printf_loc (MSG_NOTE, vect_location,
995 "Vectorizing an unaligned access.\n");
997 return true;
1000 /* Function vect_verify_datarefs_alignment
1002 Return TRUE if all data references in the loop can be
1003 handled with respect to alignment. */
1005 bool
1006 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1008 vec<data_reference_p> datarefs = vinfo->datarefs;
1009 struct data_reference *dr;
1010 unsigned int i;
1012 FOR_EACH_VEC_ELT (datarefs, i, dr)
1014 gimple *stmt = DR_STMT (dr);
1015 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1017 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1018 continue;
1020 /* For interleaving, only the alignment of the first access matters. */
1021 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1022 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1023 continue;
1025 /* Strided accesses perform only component accesses, alignment is
1026 irrelevant for them. */
1027 if (STMT_VINFO_STRIDED_P (stmt_info)
1028 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1029 continue;
1031 if (! verify_data_ref_alignment (dr))
1032 return false;
1035 return true;
1038 /* Given a memory reference EXP, return whether its alignment is less
1039 than its size. */
1041 static bool
1042 not_size_aligned (tree exp)
1044 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1045 return true;
1047 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1048 > get_object_alignment (exp));
1051 /* Function vector_alignment_reachable_p
1053 Return true if vector alignment for DR is reachable by peeling
1054 a few loop iterations. Return false otherwise. */
1056 static bool
1057 vector_alignment_reachable_p (struct data_reference *dr)
1059 gimple *stmt = DR_STMT (dr);
1060 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1061 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1063 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1065 /* For interleaved access we peel only if number of iterations in
1066 the prolog loop ({VF - misalignment}), is a multiple of the
1067 number of the interleaved accesses. */
1068 int elem_size, mis_in_elements;
1069 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1071 /* FORNOW: handle only known alignment. */
1072 if (!known_alignment_for_access_p (dr))
1073 return false;
1075 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1076 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1078 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1079 return false;
1082 /* If misalignment is known at the compile time then allow peeling
1083 only if natural alignment is reachable through peeling. */
1084 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1086 HOST_WIDE_INT elmsize =
1087 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1088 if (dump_enabled_p ())
1090 dump_printf_loc (MSG_NOTE, vect_location,
1091 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1092 dump_printf (MSG_NOTE,
1093 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1095 if (DR_MISALIGNMENT (dr) % elmsize)
1097 if (dump_enabled_p ())
1098 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1099 "data size does not divide the misalignment.\n");
1100 return false;
1104 if (!known_alignment_for_access_p (dr))
1106 tree type = TREE_TYPE (DR_REF (dr));
1107 bool is_packed = not_size_aligned (DR_REF (dr));
1108 if (dump_enabled_p ())
1109 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1110 "Unknown misalignment, %snaturally aligned\n",
1111 is_packed ? "not " : "");
1112 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1115 return true;
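/* Worked example, illustrative only: with 4-byte elements, a 16-byte vector
   type and DR_MISALIGNMENT == 8, peeling two scalar iterations reaches
   alignment, so true is returned.  With DR_MISALIGNMENT == 6 the element
   size does not divide the misalignment (6 % 4 != 0), alignment can never
   be reached by peeling whole scalar iterations, and false is returned.  */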
1119 /* Calculate the cost of the memory access represented by DR. */
1121 static void
1122 vect_get_data_access_cost (struct data_reference *dr,
1123 unsigned int *inside_cost,
1124 unsigned int *outside_cost,
1125 stmt_vector_for_cost *body_cost_vec)
1127 gimple *stmt = DR_STMT (dr);
1128 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1129 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1130 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1131 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1132 int ncopies = vf / nunits;
1134 if (DR_IS_READ (dr))
1135 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1136 NULL, body_cost_vec, false);
1137 else
1138 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1140 if (dump_enabled_p ())
1141 dump_printf_loc (MSG_NOTE, vect_location,
1142 "vect_get_data_access_cost: inside_cost = %d, "
1143 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1147 typedef struct _vect_peel_info
1149 struct data_reference *dr;
1150 int npeel;
1151 unsigned int count;
1152 } *vect_peel_info;
1154 typedef struct _vect_peel_extended_info
1156 struct _vect_peel_info peel_info;
1157 unsigned int inside_cost;
1158 unsigned int outside_cost;
1159 stmt_vector_for_cost body_cost_vec;
1160 } *vect_peel_extended_info;
1163 /* Peeling hashtable helpers. */
1165 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1167 static inline hashval_t hash (const _vect_peel_info *);
1168 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1171 inline hashval_t
1172 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1174 return (hashval_t) peel_info->npeel;
1177 inline bool
1178 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1180 return (a->npeel == b->npeel);
1184 /* Insert DR into peeling hash table with NPEEL as key. */
1186 static void
1187 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1188 loop_vec_info loop_vinfo, struct data_reference *dr,
1189 int npeel)
1191 struct _vect_peel_info elem, *slot;
1192 _vect_peel_info **new_slot;
1193 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1195 elem.npeel = npeel;
1196 slot = peeling_htab->find (&elem);
1197 if (slot)
1198 slot->count++;
1199 else
1201 slot = XNEW (struct _vect_peel_info);
1202 slot->npeel = npeel;
1203 slot->dr = dr;
1204 slot->count = 1;
1205 new_slot = peeling_htab->find_slot (slot, INSERT);
1206 *new_slot = slot;
1209 if (!supportable_dr_alignment
1210 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1211 slot->count += VECT_MAX_COST;
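/* Illustrative usage, not part of the original source ('dr_a' and 'dr_b'
   are hypothetical data-refs that both become aligned by peeling 3
   iterations):

     vect_peeling_hash_insert (&peeling_htab, loop_vinfo, dr_a, 3);
     vect_peeling_hash_insert (&peeling_htab, loop_vinfo, dr_b, 3);

   results in a single slot { npeel = 3, dr = dr_a, count = 2 }.  A data-ref
   whose unaligned access is not supported additionally gets VECT_MAX_COST
   added to its slot's count under the unlimited cost model, so that peeling
   amount is strongly preferred.  */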
1215 /* Traverse peeling hash table to find peeling option that aligns maximum
1216 number of data accesses. */
1219 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1220 _vect_peel_extended_info *max)
1222 vect_peel_info elem = *slot;
1224 if (elem->count > max->peel_info.count
1225 || (elem->count == max->peel_info.count
1226 && max->peel_info.npeel > elem->npeel))
1228 max->peel_info.npeel = elem->npeel;
1229 max->peel_info.count = elem->count;
1230 max->peel_info.dr = elem->dr;
1233 return 1;
1237 /* Traverse peeling hash table and calculate cost for each peeling option.
1238 Find the one with the lowest cost. */
1241 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1242 _vect_peel_extended_info *min)
1244 vect_peel_info elem = *slot;
1245 int save_misalignment, dummy;
1246 unsigned int inside_cost = 0, outside_cost = 0, i;
1247 gimple *stmt = DR_STMT (elem->dr);
1248 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1249 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1250 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1251 struct data_reference *dr;
1252 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1254 prologue_cost_vec.create (2);
1255 body_cost_vec.create (2);
1256 epilogue_cost_vec.create (2);
1258 FOR_EACH_VEC_ELT (datarefs, i, dr)
1260 stmt = DR_STMT (dr);
1261 stmt_info = vinfo_for_stmt (stmt);
1262 /* For interleaving, only the alignment of the first access
1263 matters. */
1264 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1265 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1266 continue;
1268 /* Strided accesses perform only component accesses, alignment is
1269 irrelevant for them. */
1270 if (STMT_VINFO_STRIDED_P (stmt_info)
1271 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1272 continue;
1274 save_misalignment = DR_MISALIGNMENT (dr);
1275 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1276 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1277 &body_cost_vec);
1278 SET_DR_MISALIGNMENT (dr, save_misalignment);
1281 outside_cost += vect_get_known_peeling_cost
1282 (loop_vinfo, elem->npeel, &dummy,
1283 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1284 &prologue_cost_vec, &epilogue_cost_vec);
1286 /* Prologue and epilogue costs are added to the target model later.
1287 These costs depend only on the scalar iteration cost, the
1288 number of peeling iterations finally chosen, and the number of
1289 misaligned statements. So discard the information found here. */
1290 prologue_cost_vec.release ();
1291 epilogue_cost_vec.release ();
1293 if (inside_cost < min->inside_cost
1294 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1296 min->inside_cost = inside_cost;
1297 min->outside_cost = outside_cost;
1298 min->body_cost_vec.release ();
1299 min->body_cost_vec = body_cost_vec;
1300 min->peel_info.dr = elem->dr;
1301 min->peel_info.npeel = elem->npeel;
1303 else
1304 body_cost_vec.release ();
1306 return 1;
1310 /* Choose best peeling option by traversing peeling hash table and either
1311 choosing an option with the lowest cost (if cost model is enabled) or the
1312 option that aligns as many accesses as possible. */
1314 static struct data_reference *
1315 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1316 loop_vec_info loop_vinfo,
1317 unsigned int *npeel,
1318 stmt_vector_for_cost *body_cost_vec)
1320 struct _vect_peel_extended_info res;
1322 res.peel_info.dr = NULL;
1323 res.body_cost_vec = stmt_vector_for_cost ();
1325 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1327 res.inside_cost = INT_MAX;
1328 res.outside_cost = INT_MAX;
1329 peeling_htab->traverse <_vect_peel_extended_info *,
1330 vect_peeling_hash_get_lowest_cost> (&res);
1332 else
1334 res.peel_info.count = 0;
1335 peeling_htab->traverse <_vect_peel_extended_info *,
1336 vect_peeling_hash_get_most_frequent> (&res);
1339 *npeel = res.peel_info.npeel;
1340 *body_cost_vec = res.body_cost_vec;
1341 return res.peel_info.dr;
1345 /* Function vect_enhance_data_refs_alignment
1347 This pass will use loop versioning and loop peeling in order to enhance
1348 the alignment of data references in the loop.
1350 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1351 original loop is to be vectorized. Any other loops that are created by
1352 the transformations performed in this pass - are not supposed to be
1353 vectorized. This restriction will be relaxed.
1355 This pass will require a cost model to guide it whether to apply peeling
1356 or versioning or a combination of the two. For example, the scheme that
1357 Intel uses when given a loop with several memory accesses is as follows:
1358 choose one memory access ('p') whose alignment you want to force by
1359 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1360 other accesses are not necessarily aligned, or (2) use loop versioning to
1361 generate one loop in which all accesses are aligned, and another loop in
1362 which only 'p' is necessarily aligned.
1364 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1365 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1366 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1368 Devising a cost model is the most critical aspect of this work. It will
1369 guide us on which access to peel for, whether to use loop versioning, how
1370 many versions to create, etc. The cost model will probably consist of
1371 generic considerations as well as target specific considerations (on
1372 powerpc for example, misaligned stores are more painful than misaligned
1373 loads).
1375 Here are the general steps involved in alignment enhancements:
1377 -- original loop, before alignment analysis:
1378 for (i=0; i<N; i++){
1379 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1380 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1383 -- After vect_compute_data_refs_alignment:
1384 for (i=0; i<N; i++){
1385 x = q[i]; # DR_MISALIGNMENT(q) = 3
1386 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1389 -- Possibility 1: we do loop versioning:
1390 if (p is aligned) {
1391 for (i=0; i<N; i++){ # loop 1A
1392 x = q[i]; # DR_MISALIGNMENT(q) = 3
1393 p[i] = y; # DR_MISALIGNMENT(p) = 0
1396 else {
1397 for (i=0; i<N; i++){ # loop 1B
1398 x = q[i]; # DR_MISALIGNMENT(q) = 3
1399 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1403 -- Possibility 2: we do loop peeling:
1404 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1405 x = q[i];
1406 p[i] = y;
1408 for (i = 3; i < N; i++){ # loop 2A
1409 x = q[i]; # DR_MISALIGNMENT(q) = 0
1410 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1413 -- Possibility 3: combination of loop peeling and versioning:
1414 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1415 x = q[i];
1416 p[i] = y;
1418 if (p is aligned) {
1419 for (i = 3; i<N; i++){ # loop 3A
1420 x = q[i]; # DR_MISALIGNMENT(q) = 0
1421 p[i] = y; # DR_MISALIGNMENT(p) = 0
1424 else {
1425 for (i = 3; i<N; i++){ # loop 3B
1426 x = q[i]; # DR_MISALIGNMENT(q) = 0
1427 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1431 These loops are later passed to loop_transform to be vectorized. The
1432 vectorizer will use the alignment information to guide the transformation
1433 (whether to generate regular loads/stores, or with special handling for
1434 misalignment). */
1436 bool
1437 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1439 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1440 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1441 enum dr_alignment_support supportable_dr_alignment;
1442 struct data_reference *dr0 = NULL, *first_store = NULL;
1443 struct data_reference *dr;
1444 unsigned int i, j;
1445 bool do_peeling = false;
1446 bool do_versioning = false;
1447 bool stat;
1448 gimple *stmt;
1449 stmt_vec_info stmt_info;
1450 unsigned int npeel = 0;
1451 bool all_misalignments_unknown = true;
1452 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1453 unsigned possible_npeel_number = 1;
1454 tree vectype;
1455 unsigned int nelements, mis, same_align_drs_max = 0;
1456 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1457 hash_table<peel_info_hasher> peeling_htab (1);
1459 if (dump_enabled_p ())
1460 dump_printf_loc (MSG_NOTE, vect_location,
1461 "=== vect_enhance_data_refs_alignment ===\n");
1463 /* Reset data so we can safely be called multiple times. */
1464 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1465 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1467 /* While cost model enhancements are expected in the future, the high level
1468 view of the code at this time is as follows:
1470 A) If there is a misaligned access then see if peeling to align
1471 this access can make all data references satisfy
1472 vect_supportable_dr_alignment. If so, update data structures
1473 as needed and return true.
1475 B) If peeling wasn't possible and there is a data reference with an
1476 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1477 then see if loop versioning checks can be used to make all data
1478 references satisfy vect_supportable_dr_alignment. If so, update
1479 data structures as needed and return true.
1481 C) If neither peeling nor versioning were successful then return false if
1482 any data reference does not satisfy vect_supportable_dr_alignment.
1484 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1486 Note, Possibility 3 above (which is peeling and versioning together) is not
1487 being done at this time. */
1489 /* (1) Peeling to force alignment. */
1491 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1492 Considerations:
1493 + How many accesses will become aligned due to the peeling
1494 - How many accesses will become unaligned due to the peeling,
1495 and the cost of misaligned accesses.
1496 - The cost of peeling (the extra runtime checks, the increase
1497 in code size). */
1499 FOR_EACH_VEC_ELT (datarefs, i, dr)
1501 stmt = DR_STMT (dr);
1502 stmt_info = vinfo_for_stmt (stmt);
1504 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1505 continue;
1507 /* For interleaving, only the alignment of the first access
1508 matters. */
1509 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1510 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1511 continue;
1513 /* For invariant accesses there is nothing to enhance. */
1514 if (integer_zerop (DR_STEP (dr)))
1515 continue;
1517 /* Strided accesses perform only component accesses, alignment is
1518 irrelevant for them. */
1519 if (STMT_VINFO_STRIDED_P (stmt_info)
1520 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1521 continue;
1523 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1524 do_peeling = vector_alignment_reachable_p (dr);
1525 if (do_peeling)
1527 if (known_alignment_for_access_p (dr))
1529 unsigned int npeel_tmp;
1530 bool negative = tree_int_cst_compare (DR_STEP (dr),
1531 size_zero_node) < 0;
1533 /* Save info about DR in the hash table. */
1534 vectype = STMT_VINFO_VECTYPE (stmt_info);
1535 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1536 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1537 TREE_TYPE (DR_REF (dr))));
1538 npeel_tmp = (negative
1539 ? (mis - nelements) : (nelements - mis))
1540 & (nelements - 1);
1542 /* For multiple types, it is possible that the bigger type access
1543 will have more than one peeling option. E.g., a loop with two
1544 types: one of size (vector size / 4), and the other one of
1545 size (vector size / 8). The vectorization factor will be 8. If both
1546 accesses are misaligned by 3, the first one needs one scalar
1547 iteration to be aligned, and the second one needs 5. But the
1548 first one will be aligned also by peeling 5 scalar
1549 iterations, and in that case both accesses will be aligned.
1550 Hence, except for the immediate peeling amount, we also want
1551 to try to add full vector size, while we don't exceed
1552 vectorization factor.
1553 We do this automatically for cost model, since we calculate cost
1554 for every peeling option. */
1555 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1557 if (STMT_SLP_TYPE (stmt_info))
1558 possible_npeel_number
1559 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1560 else
1561 possible_npeel_number = vf / nelements;
1564 /* Handle the aligned case. We may decide to align some other
1565 access, making DR unaligned. */
1566 if (DR_MISALIGNMENT (dr) == 0)
1568 npeel_tmp = 0;
1569 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1570 possible_npeel_number++;
1573 for (j = 0; j < possible_npeel_number; j++)
1575 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1576 dr, npeel_tmp);
1577 npeel_tmp += nelements;
1580 all_misalignments_unknown = false;
1581 /* Data-ref that was chosen for the case that all the
1582 misalignments are unknown is not relevant anymore, since we
1583 have a data-ref with known alignment. */
1584 dr0 = NULL;
1586 else
1588 /* If we don't know any misalignment values, we prefer
1589 peeling for the data-ref that has the maximum number of data-refs
1590 with the same alignment, unless the target prefers to align
1591 stores over loads.
1592 if (all_misalignments_unknown)
1594 unsigned same_align_drs
1595 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1596 if (!dr0
1597 || same_align_drs_max < same_align_drs)
1599 same_align_drs_max = same_align_drs;
1600 dr0 = dr;
1602 /* For data-refs with the same number of related
1603 accesses prefer the one where the misalign
1604 computation will be invariant in the outermost loop. */
1605 else if (same_align_drs_max == same_align_drs)
1607 struct loop *ivloop0, *ivloop;
1608 ivloop0 = outermost_invariant_loop_for_expr
1609 (loop, DR_BASE_ADDRESS (dr0));
1610 ivloop = outermost_invariant_loop_for_expr
1611 (loop, DR_BASE_ADDRESS (dr));
1612 if ((ivloop && !ivloop0)
1613 || (ivloop && ivloop0
1614 && flow_loop_nested_p (ivloop, ivloop0)))
1615 dr0 = dr;
1618 if (!first_store && DR_IS_WRITE (dr))
1619 first_store = dr;
1622 /* If there are both known and unknown misaligned accesses in the
1623 loop, we choose peeling amount according to the known
1624 accesses. */
1625 if (!supportable_dr_alignment)
1627 dr0 = dr;
1628 if (!first_store && DR_IS_WRITE (dr))
1629 first_store = dr;
1633 else
1635 if (!aligned_access_p (dr))
1637 if (dump_enabled_p ())
1638 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1639 "vector alignment may not be reachable\n");
1640 break;
1645 /* Check if we can possibly peel the loop. */
1646 if (!vect_can_advance_ivs_p (loop_vinfo)
1647 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1648 || loop->inner)
1649 do_peeling = false;
1651 if (do_peeling
1652 && all_misalignments_unknown
1653 && vect_supportable_dr_alignment (dr0, false))
1655 /* Check if the target requires to prefer stores over loads, i.e., if
1656 misaligned stores are more expensive than misaligned loads (taking
1657 drs with same alignment into account). */
1658 if (first_store && DR_IS_READ (dr0))
1660 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1661 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1662 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1663 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1664 stmt_vector_for_cost dummy;
1665 dummy.create (2);
1667 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1668 &dummy);
1669 vect_get_data_access_cost (first_store, &store_inside_cost,
1670 &store_outside_cost, &dummy);
1672 dummy.release ();
1674 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1675 aligning the load DR0). */
1676 load_inside_penalty = store_inside_cost;
1677 load_outside_penalty = store_outside_cost;
1678 for (i = 0;
1679 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1680 DR_STMT (first_store))).iterate (i, &dr);
1681 i++)
1682 if (DR_IS_READ (dr))
1684 load_inside_penalty += load_inside_cost;
1685 load_outside_penalty += load_outside_cost;
1687 else
1689 load_inside_penalty += store_inside_cost;
1690 load_outside_penalty += store_outside_cost;
1693 /* Calculate the penalty for leaving DR0 unaligned (by
1694 aligning the FIRST_STORE). */
1695 store_inside_penalty = load_inside_cost;
1696 store_outside_penalty = load_outside_cost;
1697 for (i = 0;
1698 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1699 DR_STMT (dr0))).iterate (i, &dr);
1700 i++)
1701 if (DR_IS_READ (dr))
1703 store_inside_penalty += load_inside_cost;
1704 store_outside_penalty += load_outside_cost;
1706 else
1708 store_inside_penalty += store_inside_cost;
1709 store_outside_penalty += store_outside_cost;
1712 if (load_inside_penalty > store_inside_penalty
1713 || (load_inside_penalty == store_inside_penalty
1714 && load_outside_penalty > store_outside_penalty))
1715 dr0 = first_store;
1718 /* Use peeling only if it may help to align other accesses in the loop or
1719 if it may help improve load bandwidth when we'd end up using
1720 unaligned loads. */
1721 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1722 if (STMT_VINFO_SAME_ALIGN_REFS
1723 (vinfo_for_stmt (DR_STMT (dr0))).length () == 0
1724 && (vect_supportable_dr_alignment (dr0, false)
1725 != dr_unaligned_supported
1726 || (DR_IS_READ (dr0)
1727 && (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1728 == builtin_vectorization_cost (unaligned_load,
1729 dr0_vt, -1)))))
1730 do_peeling = false;
1733 if (do_peeling && !dr0)
1735 /* Peeling is possible, but there is no data access that is not supported
1736 unless aligned. So we try to choose the best possible peeling. */
1738 /* We should get here only if there are drs with known misalignment. */
1739 gcc_assert (!all_misalignments_unknown);
1741 /* Choose the best peeling from the hash table. */
1742 dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
1743 loop_vinfo, &npeel,
1744 &body_cost_vec);
1745 if (!dr0 || !npeel)
1746 do_peeling = false;
1749 if (do_peeling)
1751 stmt = DR_STMT (dr0);
1752 stmt_info = vinfo_for_stmt (stmt);
1753 vectype = STMT_VINFO_VECTYPE (stmt_info);
1754 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1756 if (known_alignment_for_access_p (dr0))
1758 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1759 size_zero_node) < 0;
1760 if (!npeel)
1762 /* Since it's known at compile time, compute the number of
1763 iterations in the peeled loop (the peeling factor) for use in
1764 updating DR_MISALIGNMENT values. The peeling factor is the
1765 vectorization factor minus the misalignment as an element
1766 count. */
1767 mis = DR_MISALIGNMENT (dr0);
1768 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1769 npeel = ((negative ? mis - nelements : nelements - mis)
1770 & (nelements - 1));
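/* Illustration added by the editor (not in the original source): assuming a
   V4SI access (nelements == 4) with DR_MISALIGNMENT of 8 bytes and 4-byte
   elements, mis becomes 2 and

     npeel = (4 - 2) & 3 = 2,

   i.e. two scalar iterations are peeled to reach alignment.  */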
1773 /* For interleaved data access every iteration accesses all the
1774 members of the group, therefore we divide the number of iterations
1775 by the group size. */
1776 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1777 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1778 npeel /= GROUP_SIZE (stmt_info);
1780 if (dump_enabled_p ())
1781 dump_printf_loc (MSG_NOTE, vect_location,
1782 "Try peeling by %d\n", npeel);
1785 /* Ensure that all data refs can be vectorized after the peel. */
1786 FOR_EACH_VEC_ELT (datarefs, i, dr)
1788 int save_misalignment;
1790 if (dr == dr0)
1791 continue;
1793 stmt = DR_STMT (dr);
1794 stmt_info = vinfo_for_stmt (stmt);
1795 /* For interleaving, only the alignment of the first access
1796 matters. */
1797 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1798 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1799 continue;
1801 /* Strided accesses perform only component accesses, alignment is
1802 irrelevant for them. */
1803 if (STMT_VINFO_STRIDED_P (stmt_info)
1804 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1805 continue;
1807 save_misalignment = DR_MISALIGNMENT (dr);
1808 vect_update_misalignment_for_peel (dr, dr0, npeel);
1809 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1810 SET_DR_MISALIGNMENT (dr, save_misalignment);
1812 if (!supportable_dr_alignment)
1814 do_peeling = false;
1815 break;
1819 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1821 stat = vect_verify_datarefs_alignment (loop_vinfo);
1822 if (!stat)
1823 do_peeling = false;
1824 else
1826 body_cost_vec.release ();
1827 return stat;
1831 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1832 if (do_peeling)
1834 unsigned max_allowed_peel
1835 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1836 if (max_allowed_peel != (unsigned)-1)
1838 unsigned max_peel = npeel;
1839 if (max_peel == 0)
1841 gimple *dr_stmt = DR_STMT (dr0);
1842 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1843 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1844 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1846 if (max_peel > max_allowed_peel)
1848 do_peeling = false;
1849 if (dump_enabled_p ())
1850 dump_printf_loc (MSG_NOTE, vect_location,
1851 "Disable peeling, max peels reached: %d\n", max_peel);
1856 /* Cost model #2 - if peeling may result in a remaining loop not
1857 iterating enough to be vectorized then do not peel. */
1858 if (do_peeling
1859 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1861 unsigned max_peel
1862 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1863 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1864 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1865 do_peeling = false;
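/* Illustration added by the editor (not in the original source): with a
   vectorization factor of 4 and NPEEL unknown (0), max_peel is 3, so a loop
   whose known iteration count is below 4 + 3 = 7 is not peeled here.  */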
1868 if (do_peeling)
1870 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1871 If the misalignment of DR_i is identical to that of dr0 then set
1872 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1873 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1874 by the peeling factor times the element size of DR_i (MOD the
1875 vectorization factor times the size). Otherwise, the
1876 misalignment of DR_i must be set to unknown. */
1877 FOR_EACH_VEC_ELT (datarefs, i, dr)
1878 if (dr != dr0)
1880 /* Strided accesses perform only component accesses, alignment
1881 is irrelevant for them. */
1882 stmt_info = vinfo_for_stmt (DR_STMT (dr));
1883 if (STMT_VINFO_STRIDED_P (stmt_info)
1884 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1885 continue;
1887 vect_update_misalignment_for_peel (dr, dr0, npeel);
1890 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1891 if (npeel)
1892 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1893 else
1894 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1895 = DR_MISALIGNMENT (dr0);
1896 SET_DR_MISALIGNMENT (dr0, 0);
1897 if (dump_enabled_p ())
1899 dump_printf_loc (MSG_NOTE, vect_location,
1900 "Alignment of access forced using peeling.\n");
1901 dump_printf_loc (MSG_NOTE, vect_location,
1902 "Peeling for alignment will be applied.\n");
1904 /* The inside-loop cost will be accounted for in vectorizable_load
1905 and vectorizable_store correctly with adjusted alignments.
1906 Drop the body_cost_vec on the floor here. */
1907 body_cost_vec.release ();
1909 stat = vect_verify_datarefs_alignment (loop_vinfo);
1910 gcc_assert (stat);
1911 return stat;
1915 body_cost_vec.release ();
1917 /* (2) Versioning to force alignment. */
1919 /* Try versioning if:
1920 1) optimize loop for speed
1921 2) there is at least one unsupported misaligned data ref with an unknown
1922 misalignment, and
1923 3) all misaligned data refs with a known misalignment are supported, and
1924 4) the number of runtime alignment checks is within reason. */
1926 do_versioning =
1927 optimize_loop_nest_for_speed_p (loop)
1928 && (!loop->inner); /* FORNOW */
1930 if (do_versioning)
1932 FOR_EACH_VEC_ELT (datarefs, i, dr)
1934 stmt = DR_STMT (dr);
1935 stmt_info = vinfo_for_stmt (stmt);
1937 /* For interleaving, only the alignment of the first access
1938 matters. */
1939 if (aligned_access_p (dr)
1940 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1941 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1942 continue;
1944 if (STMT_VINFO_STRIDED_P (stmt_info))
1946 /* Strided loads perform only component accesses, alignment is
1947 irrelevant for them. */
1948 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1949 continue;
1950 do_versioning = false;
1951 break;
1954 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1956 if (!supportable_dr_alignment)
1958 gimple *stmt;
1959 int mask;
1960 tree vectype;
1962 if (known_alignment_for_access_p (dr)
1963 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1964 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1966 do_versioning = false;
1967 break;
1970 stmt = DR_STMT (dr);
1971 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1972 gcc_assert (vectype);
1974 /* The rightmost bits of an aligned address must be zeros.
1975 Construct the mask needed for this test. For example,
1976 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1977 mask must be 15 = 0xf. */
1978 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
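/* Editor's sketch (not in the original source): the runtime test later built
   from LOOP_VINFO_MAY_MISALIGN_STMTS is, roughly,

     ((addr_1 | addr_2 | ... | addr_n) & mask) == 0

   i.e. all recorded addresses must have their low bits zero for the aligned
   version of the loop to be taken.  */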
1980 /* FORNOW: use the same mask to test all potentially unaligned
1981 references in the loop. The vectorizer currently supports
1982 a single vector size, see the reference to
1983 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1984 vectorization factor is computed. */
1985 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1986 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1987 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1988 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1989 DR_STMT (dr));
1993 /* Versioning requires at least one misaligned data reference. */
1994 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1995 do_versioning = false;
1996 else if (!do_versioning)
1997 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2000 if (do_versioning)
2002 vec<gimple *> may_misalign_stmts
2003 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2004 gimple *stmt;
2006 /* It can now be assumed that the data references in the statements
2007 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2008 of the loop being vectorized. */
2009 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2011 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2012 dr = STMT_VINFO_DATA_REF (stmt_info);
2013 SET_DR_MISALIGNMENT (dr, 0);
2014 if (dump_enabled_p ())
2015 dump_printf_loc (MSG_NOTE, vect_location,
2016 "Alignment of access forced using versioning.\n");
2019 if (dump_enabled_p ())
2020 dump_printf_loc (MSG_NOTE, vect_location,
2021 "Versioning for alignment will be applied.\n");
2023 /* Peeling and versioning can't be done together at this time. */
2024 gcc_assert (! (do_peeling && do_versioning));
2026 stat = vect_verify_datarefs_alignment (loop_vinfo);
2027 gcc_assert (stat);
2028 return stat;
2031 /* This point is reached if neither peeling nor versioning is being done. */
2032 gcc_assert (! (do_peeling || do_versioning));
2034 stat = vect_verify_datarefs_alignment (loop_vinfo);
2035 return stat;
2039 /* Function vect_find_same_alignment_drs.
2041 Update group and alignment relations according to the chosen
2042 vectorization factor. */
2044 static void
2045 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
2046 loop_vec_info loop_vinfo)
2048 unsigned int i;
2049 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2050 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2051 struct data_reference *dra = DDR_A (ddr);
2052 struct data_reference *drb = DDR_B (ddr);
2053 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2054 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2055 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
2056 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
2057 lambda_vector dist_v;
2058 unsigned int loop_depth;
2060 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2061 return;
2063 if (dra == drb)
2064 return;
2066 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
2067 return;
2069 /* Loop-based vectorization and known data dependence. */
2070 if (DDR_NUM_DIST_VECTS (ddr) == 0)
2071 return;
2073 /* Data-dependence analysis reports a distance vector of zero
2074 for data-references that overlap only in the first iteration
2075 but have steps of different sign (see PR45764).
2076 So as a sanity check require equal DR_STEP. */
2077 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2078 return;
2080 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
2081 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
2083 int dist = dist_v[loop_depth];
2085 if (dump_enabled_p ())
2086 dump_printf_loc (MSG_NOTE, vect_location,
2087 "dependence distance = %d.\n", dist);
2089 /* Same loop iteration. */
2090 if (dist == 0
2091 || (dist % vectorization_factor == 0 && dra_size == drb_size))
2093 /* Two references with distance zero have the same alignment. */
2094 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2095 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
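/* Illustration added by the editor (not in the original source): with a
   vectorization factor of 4 and 4-byte ints, references a[i] and a[i+4]
   have dependence distance 4; 4 % 4 == 0 and the element sizes match, so
   both references are recorded as having the same alignment.  */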
2096 if (dump_enabled_p ())
2098 dump_printf_loc (MSG_NOTE, vect_location,
2099 "accesses have the same alignment.\n");
2100 dump_printf (MSG_NOTE,
2101 "dependence distance modulo vf == 0 between ");
2102 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2103 dump_printf (MSG_NOTE, " and ");
2104 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2105 dump_printf (MSG_NOTE, "\n");
2112 /* Function vect_analyze_data_refs_alignment
2114 Analyze the alignment of the data-references in the loop.
2115 Return FALSE if a data reference is found that cannot be vectorized. */
2117 bool
2118 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2120 if (dump_enabled_p ())
2121 dump_printf_loc (MSG_NOTE, vect_location,
2122 "=== vect_analyze_data_refs_alignment ===\n");
2124 /* Mark groups of data references with same alignment using
2125 data dependence information. */
2126 vec<ddr_p> ddrs = vinfo->ddrs;
2127 struct data_dependence_relation *ddr;
2128 unsigned int i;
2130 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2131 vect_find_same_alignment_drs (ddr, vinfo);
2133 vec<data_reference_p> datarefs = vinfo->datarefs;
2134 struct data_reference *dr;
2136 FOR_EACH_VEC_ELT (datarefs, i, dr)
2138 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2139 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2140 && !vect_compute_data_ref_alignment (dr))
2142 /* Strided accesses perform only component accesses, misalignment
2143 information is irrelevant for them. */
2144 if (STMT_VINFO_STRIDED_P (stmt_info)
2145 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2146 continue;
2148 if (dump_enabled_p ())
2149 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2150 "not vectorized: can't calculate alignment "
2151 "for data ref.\n");
2153 return false;
2157 return true;
2161 /* Analyze alignment of DRs of stmts in NODE. */
2163 static bool
2164 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2166 /* We vectorize from the first scalar stmt in the node unless
2167 the node is permuted in which case we start from the first
2168 element in the group. */
2169 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2170 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2171 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2172 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2174 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2175 if (! vect_compute_data_ref_alignment (dr)
2176 /* For creating the data-ref pointer we need alignment of the
2177 first element anyway. */
2178 || (dr != first_dr
2179 && ! vect_compute_data_ref_alignment (first_dr))
2180 || ! verify_data_ref_alignment (dr))
2182 if (dump_enabled_p ())
2183 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2184 "not vectorized: bad data alignment in basic "
2185 "block.\n");
2186 return false;
2189 return true;
2192 /* Function vect_slp_analyze_instance_alignment
2194 Analyze the alignment of the data-references in the SLP instance.
2195 Return FALSE if a data reference is found that cannot be vectorized. */
2197 bool
2198 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2200 if (dump_enabled_p ())
2201 dump_printf_loc (MSG_NOTE, vect_location,
2202 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2204 slp_tree node;
2205 unsigned i;
2206 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2207 if (! vect_slp_analyze_and_verify_node_alignment (node))
2208 return false;
2210 node = SLP_INSTANCE_TREE (instance);
2211 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2212 && ! vect_slp_analyze_and_verify_node_alignment
2213 (SLP_INSTANCE_TREE (instance)))
2214 return false;
2216 return true;
2220 /* Analyze groups of accesses: check that DR belongs to a group of
2221 accesses of legal size, step, etc. Detect gaps, single element
2222 interleaving, and other special cases. Set grouped access info.
2223 Collect groups of strided stores for further use in SLP analysis.
2224 Worker for vect_analyze_group_access. */
2226 static bool
2227 vect_analyze_group_access_1 (struct data_reference *dr)
2229 tree step = DR_STEP (dr);
2230 tree scalar_type = TREE_TYPE (DR_REF (dr));
2231 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2232 gimple *stmt = DR_STMT (dr);
2233 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2234 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2235 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2236 HOST_WIDE_INT dr_step = -1;
2237 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2238 bool slp_impossible = false;
2240 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2241 size of the interleaving group (including gaps). */
2242 if (tree_fits_shwi_p (step))
2244 dr_step = tree_to_shwi (step);
2245 /* Check that STEP is a multiple of type size. Otherwise there is
2246 a non-element-sized gap at the end of the group which we
2247 cannot represent in GROUP_GAP or GROUP_SIZE.
2248 ??? As we can handle non-constant step fine here we should
2249 simply remove uses of GROUP_GAP between the last and first
2250 element and instead rely on DR_STEP. GROUP_SIZE then would
2251 simply not include that gap. */
2252 if ((dr_step % type_size) != 0)
2254 if (dump_enabled_p ())
2256 dump_printf_loc (MSG_NOTE, vect_location,
2257 "Step ");
2258 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2259 dump_printf (MSG_NOTE,
2260 " is not a multiple of the element size for ");
2261 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2262 dump_printf (MSG_NOTE, "\n");
2264 return false;
2266 groupsize = absu_hwi (dr_step) / type_size;
2268 else
2269 groupsize = 0;
2271 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2272 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2274 /* Check if this DR is a part of interleaving, and is a single
2275 element of the group that is accessed in the loop. */
2277 /* Gaps are supported only for loads. STEP must be a multiple of the type
2278 size. The size of the group must be a power of 2. */
2279 if (DR_IS_READ (dr)
2280 && (dr_step % type_size) == 0
2281 && groupsize > 0
2282 && pow2p_hwi (groupsize))
2284 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2285 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2286 GROUP_GAP (stmt_info) = groupsize - 1;
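/* Illustration added by the editor (not in the original source): a read such
   as

     ... = a[4 * i];

   with 4-byte elements has DR_STEP 16, so groupsize is 4 and the access is
   treated as a single-element interleaving group with GROUP_GAP of 3.  */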
2287 if (dump_enabled_p ())
2289 dump_printf_loc (MSG_NOTE, vect_location,
2290 "Detected single element interleaving ");
2291 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2292 dump_printf (MSG_NOTE, " step ");
2293 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2294 dump_printf (MSG_NOTE, "\n");
2297 return true;
2300 if (dump_enabled_p ())
2302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2303 "not consecutive access ");
2304 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2307 if (bb_vinfo)
2309 /* Mark the statement as unvectorizable. */
2310 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2311 return true;
2314 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2315 STMT_VINFO_STRIDED_P (stmt_info) = true;
2316 return true;
2319 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2321 /* First stmt in the interleaving chain. Check the chain. */
2322 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2323 struct data_reference *data_ref = dr;
2324 unsigned int count = 1;
2325 tree prev_init = DR_INIT (data_ref);
2326 gimple *prev = stmt;
2327 HOST_WIDE_INT diff, gaps = 0;
2329 while (next)
2331 /* Skip same data-refs. In case two or more stmts share a
2332 data-ref (supported only for loads), we vectorize only the first
2333 stmt, and the rest get their vectorized loads from the first
2334 one. */
2335 if (!tree_int_cst_compare (DR_INIT (data_ref),
2336 DR_INIT (STMT_VINFO_DATA_REF (
2337 vinfo_for_stmt (next)))))
2339 if (DR_IS_WRITE (data_ref))
2341 if (dump_enabled_p ())
2342 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2343 "Two store stmts share the same dr.\n");
2344 return false;
2347 if (dump_enabled_p ())
2348 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2349 "Two or more load stmts share the same dr.\n");
2351 /* For loads use the same data-ref load. */
2352 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2354 prev = next;
2355 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2356 continue;
2359 prev = next;
2360 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2362 /* All group members have the same STEP by construction. */
2363 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2365 /* Check that the distance between two accesses is equal to the type
2366 size. Otherwise, we have gaps. */
2367 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2368 - TREE_INT_CST_LOW (prev_init)) / type_size;
2369 if (diff != 1)
2371 /* FORNOW: SLP of accesses with gaps is not supported. */
2372 slp_impossible = true;
2373 if (DR_IS_WRITE (data_ref))
2375 if (dump_enabled_p ())
2376 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2377 "interleaved store with gaps\n");
2378 return false;
2381 gaps += diff - 1;
2384 last_accessed_element += diff;
2386 /* Store the gap from the previous member of the group. If there is no
2387 gap in the access, GROUP_GAP is always 1. */
2388 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2390 prev_init = DR_INIT (data_ref);
2391 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2392 /* Count the number of data-refs in the chain. */
2393 count++;
2396 if (groupsize == 0)
2397 groupsize = count + gaps;
2399 /* This could be UINT_MAX but as we are generating code in a very
2400 inefficient way we have to cap earlier. See PR78699 for example. */
2401 if (groupsize > 4096)
2403 if (dump_enabled_p ())
2404 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2405 "group is too large\n");
2406 return false;
2409 /* Check that the size of the interleaving is equal to count for stores,
2410 i.e., that there are no gaps. */
2411 if (groupsize != count
2412 && !DR_IS_READ (dr))
2414 if (dump_enabled_p ())
2415 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2416 "interleaved store with gaps\n");
2417 return false;
2420 /* If there is a gap after the last load in the group it is the
2421 difference between the groupsize and the last accessed
2422 element.
2423 When there is no gap, this difference should be 0. */
2424 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
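/* Illustration added by the editor (not in the original source): for loads
   of a[4*i], a[4*i + 1] and a[4*i + 2] with a[4*i + 3] unused, count is 3,
   groupsize is 4 and last_accessed_element is 3, so the group is recorded
   with a trailing gap of 1.  */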
2426 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2427 if (dump_enabled_p ())
2429 dump_printf_loc (MSG_NOTE, vect_location,
2430 "Detected interleaving ");
2431 if (DR_IS_READ (dr))
2432 dump_printf (MSG_NOTE, "load ");
2433 else
2434 dump_printf (MSG_NOTE, "store ");
2435 dump_printf (MSG_NOTE, "of size %u starting with ",
2436 (unsigned)groupsize);
2437 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2438 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2439 dump_printf_loc (MSG_NOTE, vect_location,
2440 "There is a gap of %u elements after the group\n",
2441 GROUP_GAP (vinfo_for_stmt (stmt)));
2444 /* SLP: create an SLP data structure for every interleaving group of
2445 stores for further analysis in vect_analyze_slp. */
2446 if (DR_IS_WRITE (dr) && !slp_impossible)
2448 if (loop_vinfo)
2449 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2450 if (bb_vinfo)
2451 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2455 return true;
2458 /* Analyze groups of accesses: check that DR belongs to a group of
2459 accesses of legal size, step, etc. Detect gaps, single element
2460 interleaving, and other special cases. Set grouped access info.
2461 Collect groups of strided stores for further use in SLP analysis. */
2463 static bool
2464 vect_analyze_group_access (struct data_reference *dr)
2466 if (!vect_analyze_group_access_1 (dr))
2468 /* Dissolve the group if present. */
2469 gimple *next;
2470 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2471 while (stmt)
2473 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2474 next = GROUP_NEXT_ELEMENT (vinfo);
2475 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2476 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2477 stmt = next;
2479 return false;
2481 return true;
2484 /* Analyze the access pattern of the data-reference DR.
2485 In case of non-consecutive accesses call vect_analyze_group_access() to
2486 analyze groups of accesses. */
2488 static bool
2489 vect_analyze_data_ref_access (struct data_reference *dr)
2491 tree step = DR_STEP (dr);
2492 tree scalar_type = TREE_TYPE (DR_REF (dr));
2493 gimple *stmt = DR_STMT (dr);
2494 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2495 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2496 struct loop *loop = NULL;
2498 if (loop_vinfo)
2499 loop = LOOP_VINFO_LOOP (loop_vinfo);
2501 if (loop_vinfo && !step)
2503 if (dump_enabled_p ())
2504 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2505 "bad data-ref access in loop\n");
2506 return false;
2509 /* Allow loads with zero step in inner-loop vectorization. */
2510 if (loop_vinfo && integer_zerop (step))
2512 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2513 if (!nested_in_vect_loop_p (loop, stmt))
2514 return DR_IS_READ (dr);
2515 /* Allow references with zero step for outer loops marked
2516 with pragma omp simd only - it guarantees absence of
2517 loop-carried dependencies between inner loop iterations. */
2518 if (!loop->force_vectorize)
2520 if (dump_enabled_p ())
2521 dump_printf_loc (MSG_NOTE, vect_location,
2522 "zero step in inner loop of nest\n");
2523 return false;
2527 if (loop && nested_in_vect_loop_p (loop, stmt))
2529 /* Interleaved accesses are not yet supported within outer-loop
2530 vectorization for references in the inner-loop. */
2531 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2533 /* For the rest of the analysis we use the outer-loop step. */
2534 step = STMT_VINFO_DR_STEP (stmt_info);
2535 if (integer_zerop (step))
2537 if (dump_enabled_p ())
2538 dump_printf_loc (MSG_NOTE, vect_location,
2539 "zero step in outer loop.\n");
2540 return DR_IS_READ (dr);
2544 /* Consecutive? */
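/* Illustration added by the editor (not in the original source): for 4-byte
   elements a DR_STEP of 4 is consecutive, and a DR_STEP of -4 (the
   reverse-consecutive case) is also accepted below; both are marked as
   non-interleaving.  */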
2545 if (TREE_CODE (step) == INTEGER_CST)
2547 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2548 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2549 || (dr_step < 0
2550 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2552 /* Mark that it is not interleaving. */
2553 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2554 return true;
2558 if (loop && nested_in_vect_loop_p (loop, stmt))
2560 if (dump_enabled_p ())
2561 dump_printf_loc (MSG_NOTE, vect_location,
2562 "grouped access in outer loop.\n");
2563 return false;
2567 /* Assume this is a DR handled by non-constant strided load case. */
2568 if (TREE_CODE (step) != INTEGER_CST)
2569 return (STMT_VINFO_STRIDED_P (stmt_info)
2570 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2571 || vect_analyze_group_access (dr)));
2573 /* Not consecutive access - check if it's a part of interleaving group. */
2574 return vect_analyze_group_access (dr);
2579 /* A helper function used in the comparator function to sort data
2580 references. T1 and T2 are two data references to be compared.
2581 The function returns -1, 0, or 1. */
2583 static int
2584 compare_tree (tree t1, tree t2)
2586 int i, cmp;
2587 enum tree_code code;
2588 char tclass;
2590 if (t1 == t2)
2591 return 0;
2592 if (t1 == NULL)
2593 return -1;
2594 if (t2 == NULL)
2595 return 1;
2597 STRIP_NOPS (t1);
2598 STRIP_NOPS (t2);
2600 if (TREE_CODE (t1) != TREE_CODE (t2))
2601 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2603 code = TREE_CODE (t1);
2604 switch (code)
2606 /* For const values, we can just use hash values for comparisons. */
2607 case INTEGER_CST:
2608 case REAL_CST:
2609 case FIXED_CST:
2610 case STRING_CST:
2611 case COMPLEX_CST:
2612 case VECTOR_CST:
2614 hashval_t h1 = iterative_hash_expr (t1, 0);
2615 hashval_t h2 = iterative_hash_expr (t2, 0);
2616 if (h1 != h2)
2617 return h1 < h2 ? -1 : 1;
2618 break;
2621 case SSA_NAME:
2622 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2623 if (cmp != 0)
2624 return cmp;
2626 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2627 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2628 break;
2630 default:
2631 tclass = TREE_CODE_CLASS (code);
2633 /* For var-decl, we could compare their UIDs. */
2634 if (tclass == tcc_declaration)
2636 if (DECL_UID (t1) != DECL_UID (t2))
2637 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2638 break;
2641 /* For expressions with operands, compare their operands recursively. */
2642 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2644 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2645 if (cmp != 0)
2646 return cmp;
2650 return 0;
2654 /* Compare two data-references DRA and DRB to group them into chunks
2655 suitable for grouping. */
2657 static int
2658 dr_group_sort_cmp (const void *dra_, const void *drb_)
2660 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2661 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2662 int cmp;
2664 /* Stabilize sort. */
2665 if (dra == drb)
2666 return 0;
2668 /* DRs in different loops never belong to the same group. */
2669 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2670 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2671 if (loopa != loopb)
2672 return loopa->num < loopb->num ? -1 : 1;
2674 /* Ordering of DRs according to base. */
2675 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2677 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2678 if (cmp != 0)
2679 return cmp;
2682 /* And according to DR_OFFSET. */
2683 if (!dr_equal_offsets_p (dra, drb))
2685 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2686 if (cmp != 0)
2687 return cmp;
2690 /* Put reads before writes. */
2691 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2692 return DR_IS_READ (dra) ? -1 : 1;
2694 /* Then sort after access size. */
2695 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2696 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2698 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2699 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2700 if (cmp != 0)
2701 return cmp;
2704 /* And after step. */
2705 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2707 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2708 if (cmp != 0)
2709 return cmp;
2712 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2713 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2714 if (cmp == 0)
2715 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2716 return cmp;
2719 /* Function vect_analyze_data_ref_accesses.
2721 Analyze the access pattern of all the data references in the loop.
2723 FORNOW: the only access pattern that is considered vectorizable is a
2724 simple step 1 (consecutive) access.
2726 FORNOW: handle only arrays and pointer accesses. */
2728 bool
2729 vect_analyze_data_ref_accesses (vec_info *vinfo)
2731 unsigned int i;
2732 vec<data_reference_p> datarefs = vinfo->datarefs;
2733 struct data_reference *dr;
2735 if (dump_enabled_p ())
2736 dump_printf_loc (MSG_NOTE, vect_location,
2737 "=== vect_analyze_data_ref_accesses ===\n");
2739 if (datarefs.is_empty ())
2740 return true;
2742 /* Sort the array of datarefs to make building the interleaving chains
2743 linear. Don't modify the original vector's order, it is needed for
2744 determining what dependencies are reversed. */
2745 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2746 datarefs_copy.qsort (dr_group_sort_cmp);
2748 /* Build the interleaving chains. */
2749 for (i = 0; i < datarefs_copy.length () - 1;)
2751 data_reference_p dra = datarefs_copy[i];
2752 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2753 stmt_vec_info lastinfo = NULL;
2754 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2756 ++i;
2757 continue;
2759 for (i = i + 1; i < datarefs_copy.length (); ++i)
2761 data_reference_p drb = datarefs_copy[i];
2762 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2763 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2764 break;
2766 /* ??? Imperfect sorting (non-compatible types, non-modulo
2767 accesses, same accesses) can lead to a group being artificially
2768 split here as we don't just skip over those. If it really
2769 matters we can push those to a worklist and re-iterate
2770 over them. Then we can just skip ahead to the next DR here. */
2772 /* DRs in a different loop should not be put into the same
2773 interleaving group. */
2774 if (gimple_bb (DR_STMT (dra))->loop_father
2775 != gimple_bb (DR_STMT (drb))->loop_father)
2776 break;
2778 /* Check that the data-refs have same first location (except init)
2779 and they are both either store or load (not load and store,
2780 not masked loads or stores). */
2781 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2782 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2783 DR_BASE_ADDRESS (drb), 0)
2784 || !dr_equal_offsets_p (dra, drb)
2785 || !gimple_assign_single_p (DR_STMT (dra))
2786 || !gimple_assign_single_p (DR_STMT (drb)))
2787 break;
2789 /* Check that the data-refs have the same constant size. */
2790 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2791 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2792 if (!tree_fits_uhwi_p (sza)
2793 || !tree_fits_uhwi_p (szb)
2794 || !tree_int_cst_equal (sza, szb))
2795 break;
2797 /* Check that the data-refs have the same step. */
2798 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2799 break;
2801 /* Do not place the same access in the interleaving chain twice. */
2802 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2803 break;
2805 /* Check the types are compatible.
2806 ??? We don't distinguish this during sorting. */
2807 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2808 TREE_TYPE (DR_REF (drb))))
2809 break;
2811 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2812 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2813 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2814 gcc_assert (init_a <= init_b);
2816 /* If init_b == init_a + the size of the type * k, we have an
2817 interleaving, and DRA is accessed before DRB. */
2818 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2819 if (type_size_a == 0
2820 || (init_b - init_a) % type_size_a != 0)
2821 break;
2823 /* If we have a store, the accesses are adjacent. This splits
2824 groups into chunks we support (we don't support vectorization
2825 of stores with gaps). */
2826 if (!DR_IS_READ (dra)
2827 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2828 (DR_INIT (datarefs_copy[i-1]))
2829 != type_size_a))
2830 break;
2832 /* If the step (when constant and non-zero) is greater than the
2833 difference between the data-refs' inits, this splits groups into
2834 suitable sizes. */
2835 if (tree_fits_shwi_p (DR_STEP (dra)))
2837 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2838 if (step != 0 && step <= (init_b - init_a))
2839 break;
2842 if (dump_enabled_p ())
2844 dump_printf_loc (MSG_NOTE, vect_location,
2845 "Detected interleaving ");
2846 if (DR_IS_READ (dra))
2847 dump_printf (MSG_NOTE, "load ");
2848 else
2849 dump_printf (MSG_NOTE, "store ");
2850 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2851 dump_printf (MSG_NOTE, " and ");
2852 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2853 dump_printf (MSG_NOTE, "\n");
2856 /* Link the found element into the group list. */
2857 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2859 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2860 lastinfo = stmtinfo_a;
2862 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2863 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2864 lastinfo = stmtinfo_b;
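/* Illustration added by the editor (not in the original source): for stores

     a[2*i] = x;
     a[2*i + 1] = y;

   both DRs share base, offset and step (8 for 4-byte ints), and
   init_b - init_a equals the element size, so the second DR is linked
   behind the first into one interleaving group.  */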
2868 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2869 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2870 && !vect_analyze_data_ref_access (dr))
2872 if (dump_enabled_p ())
2873 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2874 "not vectorized: complicated access pattern.\n");
2876 if (is_a <bb_vec_info> (vinfo))
2878 /* Mark the statement as not vectorizable. */
2879 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2880 continue;
2882 else
2884 datarefs_copy.release ();
2885 return false;
2889 datarefs_copy.release ();
2890 return true;
2894 /* Operator == between two dr_with_seg_len objects.
2896 This equality operator is used to make sure two data refs
2897 are the same so that we will consider combining the
2898 aliasing checks of those two pairs of data dependent data
2899 refs. */
2901 static bool
2902 operator == (const dr_with_seg_len& d1,
2903 const dr_with_seg_len& d2)
2905 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2906 DR_BASE_ADDRESS (d2.dr), 0)
2907 && compare_tree (DR_OFFSET (d1.dr), DR_OFFSET (d2.dr)) == 0
2908 && compare_tree (DR_INIT (d1.dr), DR_INIT (d2.dr)) == 0
2909 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2912 /* Function comp_dr_with_seg_len_pair.
2914 Comparison function for sorting objects of dr_with_seg_len_pair_t
2915 so that we can combine aliasing checks in one scan. */
2917 static int
2918 comp_dr_with_seg_len_pair (const void *pa_, const void *pb_)
2920 const dr_with_seg_len_pair_t* pa = (const dr_with_seg_len_pair_t *) pa_;
2921 const dr_with_seg_len_pair_t* pb = (const dr_with_seg_len_pair_t *) pb_;
2922 const dr_with_seg_len &a1 = pa->first, &a2 = pa->second;
2923 const dr_with_seg_len &b1 = pb->first, &b2 = pb->second;
2925 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2926 if a and c have the same base address and step, and b and d have the same
2927 address and step. Therefore, if a&c or b&d do not have the same address
2928 and step, we don't care about the order of those two pairs after sorting. */
2929 int comp_res;
2931 if ((comp_res = compare_tree (DR_BASE_ADDRESS (a1.dr),
2932 DR_BASE_ADDRESS (b1.dr))) != 0)
2933 return comp_res;
2934 if ((comp_res = compare_tree (DR_BASE_ADDRESS (a2.dr),
2935 DR_BASE_ADDRESS (b2.dr))) != 0)
2936 return comp_res;
2937 if ((comp_res = compare_tree (DR_STEP (a1.dr), DR_STEP (b1.dr))) != 0)
2938 return comp_res;
2939 if ((comp_res = compare_tree (DR_STEP (a2.dr), DR_STEP (b2.dr))) != 0)
2940 return comp_res;
2941 if ((comp_res = compare_tree (DR_OFFSET (a1.dr), DR_OFFSET (b1.dr))) != 0)
2942 return comp_res;
2943 if ((comp_res = compare_tree (DR_INIT (a1.dr), DR_INIT (b1.dr))) != 0)
2944 return comp_res;
2945 if ((comp_res = compare_tree (DR_OFFSET (a2.dr), DR_OFFSET (b2.dr))) != 0)
2946 return comp_res;
2947 if ((comp_res = compare_tree (DR_INIT (a2.dr), DR_INIT (b2.dr))) != 0)
2948 return comp_res;
2950 return 0;
2953 /* Function vect_vfa_segment_size.
2955 Create an expression that computes the size of segment
2956 that will be accessed for a data reference. The function takes into
2957 account that realignment loads may access one more vector.
2959 Input:
2960 DR: The data reference.
2961 LENGTH_FACTOR: segment length to consider.
2963 Return an expression whose value is the size of segment which will be
2964 accessed by DR. */
2966 static tree
2967 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2969 tree segment_length;
2971 if (integer_zerop (DR_STEP (dr)))
2972 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2973 else
2974 segment_length = size_binop (MULT_EXPR,
2975 fold_convert (sizetype, DR_STEP (dr)),
2976 fold_convert (sizetype, length_factor));
2978 if (vect_supportable_dr_alignment (dr, false)
2979 == dr_explicit_realign_optimized)
2981 tree vector_size = TYPE_SIZE_UNIT
2982 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2984 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2986 return segment_length;
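/* Illustration added by the editor (not in the original source): for a
   4-byte access with DR_STEP 4 and LENGTH_FACTOR equal to a vectorization
   factor of 4, the segment size is 16 bytes; one extra vector is added only
   for dr_explicit_realign_optimized accesses.  */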
2989 /* Function vect_no_alias_p.
2991 Given data references A and B with equal base and offset, the alias
2992 relation can be decided at compilation time, return TRUE if they do
2993 not alias each other; return FALSE otherwise. SEGMENT_LENGTH_A
2994 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
2995 respectively. */
2997 static bool
2998 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
2999 tree segment_length_a, tree segment_length_b)
3001 gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
3002 && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
3003 if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
3004 return false;
3006 tree seg_a_min = DR_INIT (a);
3007 tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
3008 seg_a_min, segment_length_a);
3009 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3010 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3011 [a, a+12) */
3012 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3014 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
3015 seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
3016 seg_a_max, unit_size);
3017 seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
3018 DR_INIT (a), unit_size);
3020 tree seg_b_min = DR_INIT (b);
3021 tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
3022 seg_b_min, segment_length_b);
3023 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3025 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
3026 seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
3027 seg_b_max, unit_size);
3028 seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
3029 DR_INIT (b), unit_size);
3032 if (tree_int_cst_le (seg_a_max, seg_b_min)
3033 || tree_int_cst_le (seg_b_max, seg_a_min))
3034 return true;
3036 return false;
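/* Illustration added by the editor (not in the original source): with
   positive steps, DR_INIT (a) = 0, segment_length_a = 16, DR_INIT (b) = 16
   and segment_length_b = 16, segment A covers [0, 16) and segment B covers
   [16, 32); seg_a_max <= seg_b_min, so the function above returns true
   (no alias).  */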
3039 /* Function vect_prune_runtime_alias_test_list.
3041 Prune a list of ddrs to be tested at run-time by versioning for alias.
3042 Merge several alias checks into one if possible.
3043 Return FALSE if the resulting list of ddrs is longer than allowed by
3044 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3046 bool
3047 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3049 vec<ddr_p> may_alias_ddrs =
3050 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3051 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
3052 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3053 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3054 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3056 ddr_p ddr;
3057 unsigned int i;
3058 tree length_factor;
3060 if (dump_enabled_p ())
3061 dump_printf_loc (MSG_NOTE, vect_location,
3062 "=== vect_prune_runtime_alias_test_list ===\n");
3064 if (may_alias_ddrs.is_empty ())
3065 return true;
3067 /* Basically, for each pair of dependent data refs store_ptr_0
3068 and load_ptr_0, we create an expression:
3070 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
3071 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
3073 for aliasing checks. However, in some cases we can decrease
3074 the number of checks by combining two checks into one. For
3075 example, suppose we have another pair of data refs store_ptr_0
3076 and load_ptr_1, and if the following condition is satisfied:
3078 load_ptr_0 < load_ptr_1 &&
3079 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
3081 (this condition means, in each iteration of vectorized loop,
3082 the accessed memory of store_ptr_0 cannot be between the memory
3083 of load_ptr_0 and load_ptr_1.)
3085 we then can use only the following expression to finish the
3086 aliasing checks between store_ptr_0 & load_ptr_0 and
3087 store_ptr_0 & load_ptr_1:
3089 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
3090 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
3092 Note that we only consider that load_ptr_0 and load_ptr_1 have the
3093 same base address. */
3095 comp_alias_ddrs.create (may_alias_ddrs.length ());
3097 /* First, we collect all data ref pairs for aliasing checks. */
3098 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3100 int comp_res;
3101 struct data_reference *dr_a, *dr_b;
3102 gimple *dr_group_first_a, *dr_group_first_b;
3103 tree segment_length_a, segment_length_b;
3104 gimple *stmt_a, *stmt_b;
3106 dr_a = DDR_A (ddr);
3107 stmt_a = DR_STMT (DDR_A (ddr));
3108 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3109 if (dr_group_first_a)
3111 stmt_a = dr_group_first_a;
3112 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3115 dr_b = DDR_B (ddr);
3116 stmt_b = DR_STMT (DDR_B (ddr));
3117 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3118 if (dr_group_first_b)
3120 stmt_b = dr_group_first_b;
3121 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3124 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3125 length_factor = scalar_loop_iters;
3126 else
3127 length_factor = size_int (vect_factor);
3128 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3129 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3131 comp_res = compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b));
3132 if (comp_res == 0)
3133 comp_res = compare_tree (DR_OFFSET (dr_a), DR_OFFSET (dr_b));
3135 /* Alias is known at compilation time. */
3136 if (comp_res == 0
3137 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3138 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3139 && TREE_CODE (segment_length_a) == INTEGER_CST
3140 && TREE_CODE (segment_length_b) == INTEGER_CST)
3142 if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b))
3143 continue;
3145 if (dump_enabled_p ())
3146 dump_printf_loc (MSG_NOTE, vect_location,
3147 "not vectorized: compilation time alias.\n");
3149 return false;
3152 dr_with_seg_len_pair_t dr_with_seg_len_pair
3153 (dr_with_seg_len (dr_a, segment_length_a),
3154 dr_with_seg_len (dr_b, segment_length_b));
3156 /* Canonicalize pairs by sorting the two DR members. */
3157 if (comp_res > 0)
3158 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3160 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3163 /* Second, we sort the collected data ref pairs so that we can scan
3164 them once to combine all possible aliasing checks. */
3165 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
3167 /* Third, we scan the sorted dr pairs and check if we can combine
3168 alias checks of two neighboring dr pairs. */
3169 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
3171 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
3172 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
3173 *dr_b1 = &comp_alias_ddrs[i-1].second,
3174 *dr_a2 = &comp_alias_ddrs[i].first,
3175 *dr_b2 = &comp_alias_ddrs[i].second;
3177 /* Remove duplicate data ref pairs. */
3178 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
3180 if (dump_enabled_p ())
3182 dump_printf_loc (MSG_NOTE, vect_location,
3183 "found equal ranges ");
3184 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3185 DR_REF (dr_a1->dr));
3186 dump_printf (MSG_NOTE, ", ");
3187 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3188 DR_REF (dr_b1->dr));
3189 dump_printf (MSG_NOTE, " and ");
3190 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3191 DR_REF (dr_a2->dr));
3192 dump_printf (MSG_NOTE, ", ");
3193 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3194 DR_REF (dr_b2->dr));
3195 dump_printf (MSG_NOTE, "\n");
3198 comp_alias_ddrs.ordered_remove (i--);
3199 continue;
3202 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
3204 /* We consider the case that DR_B1 and DR_B2 are the same memrefs,
3205 and DR_A1 and DR_A2 are two consecutive memrefs. */
3206 if (*dr_a1 == *dr_a2)
3208 std::swap (dr_a1, dr_b1);
3209 std::swap (dr_a2, dr_b2);
3212 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
3213 DR_BASE_ADDRESS (dr_a2->dr), 0)
3214 || !operand_equal_p (DR_OFFSET (dr_a1->dr),
3215 DR_OFFSET (dr_a2->dr), 0)
3216 || !tree_fits_shwi_p (DR_INIT (dr_a1->dr))
3217 || !tree_fits_shwi_p (DR_INIT (dr_a2->dr)))
3218 continue;
3220 /* Make sure dr_a1 starts left of dr_a2. */
3221 if (tree_int_cst_lt (DR_INIT (dr_a2->dr), DR_INIT (dr_a1->dr)))
3222 std::swap (*dr_a1, *dr_a2);
3224 bool do_remove = false;
3225 unsigned HOST_WIDE_INT diff
3226 = (tree_to_shwi (DR_INIT (dr_a2->dr))
3227 - tree_to_shwi (DR_INIT (dr_a1->dr)));
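/* Illustration added by the editor (not in the original source): if dr_a1's
   segment is [init, init + 8) (seg_len 8) and dr_a2 starts 8 bytes later
   (DIFF == 8), the first case below applies and the merged segment length
   becomes dr_a2's length plus 8.  */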
3229 /* If the left segment does not extend beyond the start of the
3230 right segment the new segment length is that of the right
3231 plus the segment distance. */
3232 if (tree_fits_uhwi_p (dr_a1->seg_len)
3233 && compare_tree_int (dr_a1->seg_len, diff) <= 0)
3235 dr_a1->seg_len = size_binop (PLUS_EXPR, dr_a2->seg_len,
3236 size_int (diff));
3237 do_remove = true;
3239 /* Generally the new segment length is the maximum of the
3240 left segment size and the right segment size plus the distance.
3241 ??? We can also build tree MAX_EXPR here but it's not clear this
3242 is profitable. */
3243 else if (tree_fits_uhwi_p (dr_a1->seg_len)
3244 && tree_fits_uhwi_p (dr_a2->seg_len))
3246 unsigned HOST_WIDE_INT seg_len_a1 = tree_to_uhwi (dr_a1->seg_len);
3247 unsigned HOST_WIDE_INT seg_len_a2 = tree_to_uhwi (dr_a2->seg_len);
3248 dr_a1->seg_len = size_int (MAX (seg_len_a1, diff + seg_len_a2));
3249 do_remove = true;
3251 /* Now we check if the following condition is satisfied:
3253 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
3255 where DIFF = DR_A2_INIT - DR_A1_INIT. However,
3256 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
3257 have to make a best estimate. We can get the minimum value
3258 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
3259 then either of the following two conditions can guarantee the
3260 one above:
3262 1: DIFF <= MIN_SEG_LEN_B
3263 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B */
3264 else
3266 unsigned HOST_WIDE_INT min_seg_len_b
3267 = (tree_fits_uhwi_p (dr_b1->seg_len)
3268 ? tree_to_uhwi (dr_b1->seg_len)
3269 : vect_factor);
3271 if (diff <= min_seg_len_b
3272 || (tree_fits_uhwi_p (dr_a1->seg_len)
3273 && diff - tree_to_uhwi (dr_a1->seg_len) < min_seg_len_b))
3275 dr_a1->seg_len = size_binop (PLUS_EXPR,
3276 dr_a2->seg_len, size_int (diff));
3277 do_remove = true;
3281 if (do_remove)
3283 if (dump_enabled_p ())
3285 dump_printf_loc (MSG_NOTE, vect_location,
3286 "merging ranges for ");
3287 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1->dr));
3288 dump_printf (MSG_NOTE, ", ");
3289 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1->dr));
3290 dump_printf (MSG_NOTE, " and ");
3291 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2->dr));
3292 dump_printf (MSG_NOTE, ", ");
3293 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2->dr));
3294 dump_printf (MSG_NOTE, "\n");
3296 comp_alias_ddrs.ordered_remove (i--);
3301 dump_printf_loc (MSG_NOTE, vect_location,
3302 "improved number of alias checks from %d to %d\n",
3303 may_alias_ddrs.length (), comp_alias_ddrs.length ());
3304 if ((int) comp_alias_ddrs.length () >
3305 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3307 if (dump_enabled_p ())
3308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3309 "number of versioning for alias "
3310 "run-time tests exceeds %d "
3311 "(--param vect-max-version-for-alias-checks)\n",
3312 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3313 return false;
3316 /* All alias checks have been resolved at compilation time. */
3317 if (!comp_alias_ddrs.length ())
3318 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0);
3320 return true;
3323 /* Return true if a non-affine read or write in STMT is suitable for a
3324 gather load or scatter store. Describe the operation in *INFO if so. */
3326 bool
3327 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3328 gather_scatter_info *info)
3330 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3331 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3332 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3333 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3334 tree offtype = NULL_TREE;
3335 tree decl, base, off;
3336 machine_mode pmode;
3337 int punsignedp, reversep, pvolatilep = 0;
3339 base = DR_REF (dr);
3340 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3341 see if we can use the def stmt of the address. */
3342 if (is_gimple_call (stmt)
3343 && gimple_call_internal_p (stmt)
3344 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3345 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3346 && TREE_CODE (base) == MEM_REF
3347 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3348 && integer_zerop (TREE_OPERAND (base, 1))
3349 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3351 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3352 if (is_gimple_assign (def_stmt)
3353 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3354 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3357 /* The gather and scatter builtins need address of the form
3358 loop_invariant + vector * {1, 2, 4, 8}
3360 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3361 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3362 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3363 multiplications and additions in it. To get a vector, we need
3364 a single SSA_NAME that will be defined in the loop and will
3365 contain everything that is not loop invariant and that can be
3366 vectorized. The following code attempts to find such a preexisting
3367 SSA_NAME OFF and put the loop invariants into a tree BASE
3368 that can be gimplified before the loop. */
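/* Editor's sketch (not in the original source): for a gather-style read such
   as

     ... = a[b[i]];

   with 4-byte elements, the code below typically ends up with BASE being the
   (invariant) address of a, OFF being the SSA_NAME holding b[i], and SCALE
   being 4, matching the loop_invariant + vector * scale form above.  */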
3369 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3370 &punsignedp, &reversep, &pvolatilep);
3371 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3373 if (TREE_CODE (base) == MEM_REF)
3375 if (!integer_zerop (TREE_OPERAND (base, 1)))
3377 if (off == NULL_TREE)
3379 offset_int moff = mem_ref_offset (base);
3380 off = wide_int_to_tree (sizetype, moff);
3382 else
3383 off = size_binop (PLUS_EXPR, off,
3384 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3386 base = TREE_OPERAND (base, 0);
3388 else
3389 base = build_fold_addr_expr (base);
3391 if (off == NULL_TREE)
3392 off = size_zero_node;
3394 /* If base is not loop invariant, then if off is 0 we start with just
3395 the constant offset in the loop invariant BASE and continue with base
3396 as OFF; otherwise give up.
3397 We could handle that case by gimplifying the addition of base + off
3398 into some SSA_NAME and use that as off, but for now punt. */
3399 if (!expr_invariant_in_loop_p (loop, base))
3401 if (!integer_zerop (off))
3402 return false;
3403 off = base;
3404 base = size_int (pbitpos / BITS_PER_UNIT);
3406 /* Otherwise put base + constant offset into the loop invariant BASE
3407 and continue with OFF. */
3408 else
3410 base = fold_convert (sizetype, base);
3411 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3414 /* OFF at this point may be either a SSA_NAME or some tree expression
3415 from get_inner_reference. Try to peel off loop invariants from it
3416 into BASE as long as possible. */
3417 STRIP_NOPS (off);
3418 while (offtype == NULL_TREE)
3420 enum tree_code code;
3421 tree op0, op1, add = NULL_TREE;
3423 if (TREE_CODE (off) == SSA_NAME)
3425 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3427 if (expr_invariant_in_loop_p (loop, off))
3428 return false;
3430 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3431 break;
3433 op0 = gimple_assign_rhs1 (def_stmt);
3434 code = gimple_assign_rhs_code (def_stmt);
3435 op1 = gimple_assign_rhs2 (def_stmt);
3437 else
3439 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3440 return false;
3441 code = TREE_CODE (off);
3442 extract_ops_from_tree (off, &code, &op0, &op1);
3444 switch (code)
3446 case POINTER_PLUS_EXPR:
3447 case PLUS_EXPR:
3448 if (expr_invariant_in_loop_p (loop, op0))
3450 add = op0;
3451 off = op1;
3452 do_add:
3453 add = fold_convert (sizetype, add);
3454 if (scale != 1)
3455 add = size_binop (MULT_EXPR, add, size_int (scale));
3456 base = size_binop (PLUS_EXPR, base, add);
3457 continue;
3459 if (expr_invariant_in_loop_p (loop, op1))
3461 add = op1;
3462 off = op0;
3463 goto do_add;
3465 break;
3466 case MINUS_EXPR:
3467 if (expr_invariant_in_loop_p (loop, op1))
3469 add = fold_convert (sizetype, op1);
3470 add = size_binop (MINUS_EXPR, size_zero_node, add);
3471 off = op0;
3472 goto do_add;
3474 break;
3475 case MULT_EXPR:
3476 if (scale == 1 && tree_fits_shwi_p (op1))
3478 scale = tree_to_shwi (op1);
3479 off = op0;
3480 continue;
3482 break;
3483 case SSA_NAME:
3484 off = op0;
3485 continue;
3486 CASE_CONVERT:
3487 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3488 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3489 break;
3490 if (TYPE_PRECISION (TREE_TYPE (op0))
3491 == TYPE_PRECISION (TREE_TYPE (off)))
3493 off = op0;
3494 continue;
3496 if (TYPE_PRECISION (TREE_TYPE (op0))
3497 < TYPE_PRECISION (TREE_TYPE (off)))
3499 off = op0;
3500 offtype = TREE_TYPE (off);
3501 STRIP_NOPS (off);
3502 continue;
3504 break;
3505 default:
3506 break;
3508 break;
3511 /* If at the end OFF still isn't an SSA_NAME or isn't
3512 defined in the loop, punt. */
3513 if (TREE_CODE (off) != SSA_NAME
3514 || expr_invariant_in_loop_p (loop, off))
3515 return false;
3517 if (offtype == NULL_TREE)
3518 offtype = TREE_TYPE (off);
3520 if (DR_IS_READ (dr))
3521 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3522 offtype, scale);
3523 else
3524 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3525 offtype, scale);
3527 if (decl == NULL_TREE)
3528 return false;
3530 info->decl = decl;
3531 info->base = base;
3532 info->offset = off;
3533 info->offset_dt = vect_unknown_def_type;
3534 info->offset_vectype = NULL_TREE;
3535 info->scale = scale;
3536 return true;
3539 /* Function vect_analyze_data_refs.
3541 Find all the data references in the loop or basic block.
3543 The general structure of the analysis of data refs in the vectorizer is as
3544 follows:
3545 1- vect_analyze_data_refs(loop/bb): call
3546 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3547 in the loop/bb and their dependences.
3548 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3549 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3550 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3554 bool
3555 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3557 struct loop *loop = NULL;
3558 unsigned int i;
3559 struct data_reference *dr;
3560 tree scalar_type;
3562 if (dump_enabled_p ())
3563 dump_printf_loc (MSG_NOTE, vect_location,
3564 "=== vect_analyze_data_refs ===\n");
3566 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3567 loop = LOOP_VINFO_LOOP (loop_vinfo);
3569 /* Go through the data-refs, check that the analysis succeeded. Update
3570 pointer from stmt_vec_info struct to DR and vectype. */
3572 vec<data_reference_p> datarefs = vinfo->datarefs;
3573 FOR_EACH_VEC_ELT (datarefs, i, dr)
3575 gimple *stmt;
3576 stmt_vec_info stmt_info;
3577 tree base, offset, init;
3578 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3579 bool simd_lane_access = false;
3580 int vf;
3582 again:
3583 if (!dr || !DR_REF (dr))
3585 if (dump_enabled_p ())
3586 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3587 "not vectorized: unhandled data-ref\n");
3588 return false;
3591 stmt = DR_STMT (dr);
3592 stmt_info = vinfo_for_stmt (stmt);
3594 /* Discard clobbers from the dataref vector. We will remove
3595 clobber stmts during vectorization. */
3596 if (gimple_clobber_p (stmt))
3598 free_data_ref (dr);
3599 if (i == datarefs.length () - 1)
3601 datarefs.pop ();
3602 break;
3604 datarefs.ordered_remove (i);
3605 dr = datarefs[i];
3606 goto again;
3609 /* Check that analysis of the data-ref succeeded. */
3610 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3611 || !DR_STEP (dr))
3613 bool maybe_gather
3614 = DR_IS_READ (dr)
3615 && !TREE_THIS_VOLATILE (DR_REF (dr))
3616 && targetm.vectorize.builtin_gather != NULL;
3617 bool maybe_scatter
3618 = DR_IS_WRITE (dr)
3619 && !TREE_THIS_VOLATILE (DR_REF (dr))
3620 && targetm.vectorize.builtin_scatter != NULL;
3621 bool maybe_simd_lane_access
3622 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3624 /* If target supports vector gather loads or scatter stores, or if
3625 this might be a SIMD lane access, see whether they can be used. */
3626 if (is_a <loop_vec_info> (vinfo)
3627 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3628 && !nested_in_vect_loop_p (loop, stmt))
3630 struct data_reference *newdr
3631 = create_data_ref (NULL, loop_containing_stmt (stmt),
3632 DR_REF (dr), stmt, maybe_scatter ? false : true);
3633 gcc_assert (newdr != NULL && DR_REF (newdr));
3634 if (DR_BASE_ADDRESS (newdr)
3635 && DR_OFFSET (newdr)
3636 && DR_INIT (newdr)
3637 && DR_STEP (newdr)
3638 && integer_zerop (DR_STEP (newdr)))
3640 if (maybe_simd_lane_access)
3642 tree off = DR_OFFSET (newdr);
3643 STRIP_NOPS (off);
3644 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3645 && TREE_CODE (off) == MULT_EXPR
3646 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3648 tree step = TREE_OPERAND (off, 1);
3649 off = TREE_OPERAND (off, 0);
3650 STRIP_NOPS (off);
3651 if (CONVERT_EXPR_P (off)
3652 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3653 0)))
3654 < TYPE_PRECISION (TREE_TYPE (off)))
3655 off = TREE_OPERAND (off, 0);
3656 if (TREE_CODE (off) == SSA_NAME)
3658 gimple *def = SSA_NAME_DEF_STMT (off);
3659 tree reft = TREE_TYPE (DR_REF (newdr));
3660 if (is_gimple_call (def)
3661 && gimple_call_internal_p (def)
3662 && (gimple_call_internal_fn (def)
3663 == IFN_GOMP_SIMD_LANE))
3665 tree arg = gimple_call_arg (def, 0);
3666 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3667 arg = SSA_NAME_VAR (arg);
3668 if (arg == loop->simduid
3669 /* For now. */
3670 && tree_int_cst_equal
3671 (TYPE_SIZE_UNIT (reft),
3672 step))
3674 DR_OFFSET (newdr) = ssize_int (0);
3675 DR_STEP (newdr) = step;
3676 DR_ALIGNED_TO (newdr)
3677 = size_int (BIGGEST_ALIGNMENT);
3678 dr = newdr;
3679 simd_lane_access = true;
3685 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3687 dr = newdr;
3688 if (maybe_gather)
3689 gatherscatter = GATHER;
3690 else
3691 gatherscatter = SCATTER;
3694 if (gatherscatter == SG_NONE && !simd_lane_access)
3695 free_data_ref (newdr);
3698 if (gatherscatter == SG_NONE && !simd_lane_access)
3700 if (dump_enabled_p ())
3702 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3703 "not vectorized: data ref analysis "
3704 "failed ");
3705 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3708 if (is_a <bb_vec_info> (vinfo))
3709 break;
3711 return false;
3715 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3717 if (dump_enabled_p ())
3718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3719 "not vectorized: base addr of dr is a "
3720 "constant\n");
3722 if (is_a <bb_vec_info> (vinfo))
3723 break;
3725 if (gatherscatter != SG_NONE || simd_lane_access)
3726 free_data_ref (dr);
3727 return false;
3730 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3732 if (dump_enabled_p ())
3734 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3735 "not vectorized: volatile type ");
3736 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3739 if (is_a <bb_vec_info> (vinfo))
3740 break;
3742 return false;
3745 if (stmt_can_throw_internal (stmt))
3747 if (dump_enabled_p ())
3749 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3750 "not vectorized: statement can throw an "
3751 "exception ");
3752 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3755 if (is_a <bb_vec_info> (vinfo))
3756 break;
3758 if (gatherscatter != SG_NONE || simd_lane_access)
3759 free_data_ref (dr);
3760 return false;
3763 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3764 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3766 if (dump_enabled_p ())
3768 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3769 "not vectorized: statement is bitfield "
3770 "access ");
3771 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3774 if (is_a <bb_vec_info> (vinfo))
3775 break;
3777 if (gatherscatter != SG_NONE || simd_lane_access)
3778 free_data_ref (dr);
3779 return false;
3782 base = unshare_expr (DR_BASE_ADDRESS (dr));
3783 offset = unshare_expr (DR_OFFSET (dr));
3784 init = unshare_expr (DR_INIT (dr));
3786 if (is_gimple_call (stmt)
3787 && (!gimple_call_internal_p (stmt)
3788 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3789 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3791 if (dump_enabled_p ())
3793 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3794 "not vectorized: dr in a call ");
3795 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3798 if (is_a <bb_vec_info> (vinfo))
3799 break;
3801 if (gatherscatter != SG_NONE || simd_lane_access)
3802 free_data_ref (dr);
3803 return false;
3806 /* Update DR field in stmt_vec_info struct. */
3808 /* If the dataref is in an inner-loop of the loop that is considered
3809 for vectorization, we also want to analyze the access relative to
3810 the outer-loop (DR contains information only relative to the
3811 inner-most enclosing loop). We do that by building a reference to the
3812 first location accessed by the inner-loop, and analyze it relative to
3813 the outer-loop. */
3814 if (loop && nested_in_vect_loop_p (loop, stmt))
3816 tree outer_step, outer_base, outer_init;
3817 HOST_WIDE_INT pbitsize, pbitpos;
3818 tree poffset;
3819 machine_mode pmode;
3820 int punsignedp, preversep, pvolatilep;
3821 affine_iv base_iv, offset_iv;
3822 tree dinit;
3824 /* Build a reference to the first location accessed by the
3825 inner-loop: *(BASE+INIT). (The first location is actually
3826 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3827 tree inner_base = build_fold_indirect_ref
3828 (fold_build_pointer_plus (base, init));
3830 if (dump_enabled_p ())
3832 dump_printf_loc (MSG_NOTE, vect_location,
3833 "analyze in outer-loop: ");
3834 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3835 dump_printf (MSG_NOTE, "\n");
3838 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3839 &poffset, &pmode, &punsignedp,
3840 &preversep, &pvolatilep);
3841 gcc_assert (outer_base != NULL_TREE);
3843 if (pbitpos % BITS_PER_UNIT != 0)
3845 if (dump_enabled_p ())
3846 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3847 "failed: bit offset alignment.\n");
3848 return false;
3851 if (preversep)
3853 if (dump_enabled_p ())
3854 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3855 "failed: reverse storage order.\n");
3856 return false;
3859 outer_base = build_fold_addr_expr (outer_base);
3860 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3861 &base_iv, false))
3863 if (dump_enabled_p ())
3864 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3865 "failed: evolution of base is not affine.\n");
3866 return false;
3869 if (offset)
3871 if (poffset)
3872 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3873 poffset);
3874 else
3875 poffset = offset;
3878 if (!poffset)
3880 offset_iv.base = ssize_int (0);
3881 offset_iv.step = ssize_int (0);
3883 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3884 &offset_iv, false))
3886 if (dump_enabled_p ())
3887 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3888 "evolution of offset is not affine.\n");
3889 return false;
3892 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3893 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3894 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3895 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3896 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3898 outer_step = size_binop (PLUS_EXPR,
3899 fold_convert (ssizetype, base_iv.step),
3900 fold_convert (ssizetype, offset_iv.step));
3902 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3903 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3904 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3905 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3906 STMT_VINFO_DR_OFFSET (stmt_info) =
3907 fold_convert (ssizetype, offset_iv.base);
3908 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3909 size_int (highest_pow2_factor (offset_iv.base));
3911 if (dump_enabled_p ())
3913 dump_printf_loc (MSG_NOTE, vect_location,
3914 "\touter base_address: ");
3915 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3916 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3917 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3918 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3919 STMT_VINFO_DR_OFFSET (stmt_info));
3920 dump_printf (MSG_NOTE,
3921 "\n\touter constant offset from base address: ");
3922 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3923 STMT_VINFO_DR_INIT (stmt_info));
3924 dump_printf (MSG_NOTE, "\n\touter step: ");
3925 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3926 STMT_VINFO_DR_STEP (stmt_info));
3927 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3928 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3929 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3930 dump_printf (MSG_NOTE, "\n");
3934 if (STMT_VINFO_DATA_REF (stmt_info))
3936 if (dump_enabled_p ())
3938 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3939 "not vectorized: more than one data ref "
3940 "in stmt: ");
3941 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3944 if (is_a <bb_vec_info> (vinfo))
3945 break;
3947 if (gatherscatter != SG_NONE || simd_lane_access)
3948 free_data_ref (dr);
3949 return false;
3952 STMT_VINFO_DATA_REF (stmt_info) = dr;
3953 if (simd_lane_access)
3955 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3956 free_data_ref (datarefs[i]);
3957 datarefs[i] = dr;
3960 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
3961 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
3962 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
3964 if (dump_enabled_p ())
3966 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3967 "not vectorized: base object not addressable "
3968 "for stmt: ");
3969 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3971 if (is_a <bb_vec_info> (vinfo))
3973 /* In BB vectorization the ref can still participate
3974 in dependence analysis, we just can't vectorize it. */
3975 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3976 continue;
3978 return false;
3981 /* Set vectype for STMT. */
3982 scalar_type = TREE_TYPE (DR_REF (dr));
3983 STMT_VINFO_VECTYPE (stmt_info)
3984 = get_vectype_for_scalar_type (scalar_type);
3985 if (!STMT_VINFO_VECTYPE (stmt_info))
3987 if (dump_enabled_p ())
3989 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3990 "not vectorized: no vectype for stmt: ");
3991 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3992 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3993 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3994 scalar_type);
3995 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3998 if (is_a <bb_vec_info> (vinfo))
4000 /* No vector type is fine, the ref can still participate
4001 in dependence analysis, we just can't vectorize it. */
4002 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4003 continue;
4006 if (gatherscatter != SG_NONE || simd_lane_access)
4008 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4009 if (gatherscatter != SG_NONE)
4010 free_data_ref (dr);
4012 return false;
4014 else
4016 if (dump_enabled_p ())
4018 dump_printf_loc (MSG_NOTE, vect_location,
4019 "got vectype for stmt: ");
4020 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4021 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4022 STMT_VINFO_VECTYPE (stmt_info));
4023 dump_printf (MSG_NOTE, "\n");
4027 /* Adjust the minimal vectorization factor according to the
4028 vector type. */
4029 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4030 if (vf > *min_vf)
4031 *min_vf = vf;
4033 if (gatherscatter != SG_NONE)
4035 gather_scatter_info gs_info;
4036 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4037 &gs_info)
4038 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4040 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4041 free_data_ref (dr);
4042 if (dump_enabled_p ())
4044 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4045 (gatherscatter == GATHER) ?
4046 "not vectorized: not suitable for gather "
4047 "load " :
4048 "not vectorized: not suitable for scatter "
4049 "store ");
4050 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4052 return false;
4055 free_data_ref (datarefs[i]);
4056 datarefs[i] = dr;
4057 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4060 else if (is_a <loop_vec_info> (vinfo)
4061 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4063 if (nested_in_vect_loop_p (loop, stmt))
4065 if (dump_enabled_p ())
4067 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4068 "not vectorized: not suitable for strided "
4069 "load ");
4070 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4072 return false;
4074 STMT_VINFO_STRIDED_P (stmt_info) = true;
4078 /* If we stopped analysis at the first dataref we could not analyze
4079 when trying to vectorize a basic-block mark the rest of the datarefs
4080 as not vectorizable and truncate the vector of datarefs. That
4081 avoids spending useless time in analyzing their dependence. */
4082 if (i != datarefs.length ())
4084 gcc_assert (is_a <bb_vec_info> (vinfo));
4085 for (unsigned j = i; j < datarefs.length (); ++j)
4087 data_reference_p dr = datarefs[j];
4088 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
4089 free_data_ref (dr);
4091 datarefs.truncate (i);
4094 return true;
4098 /* Function vect_get_new_vect_var.
4100 Returns a name for a new variable. The current naming scheme uses the
4101 prefix "vect", "stmp", "mask" or "vectp" (depending on the value of
4102 VAR_KIND) for vectorizer-generated variables, and appends "_" and NAME
4103 to the prefix if NAME is provided. */
4105 tree
4106 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4108 const char *prefix;
4109 tree new_vect_var;
4111 switch (var_kind)
4113 case vect_simple_var:
4114 prefix = "vect";
4115 break;
4116 case vect_scalar_var:
4117 prefix = "stmp";
4118 break;
4119 case vect_mask_var:
4120 prefix = "mask";
4121 break;
4122 case vect_pointer_var:
4123 prefix = "vectp";
4124 break;
4125 default:
4126 gcc_unreachable ();
4129 if (name)
4131 char* tmp = concat (prefix, "_", name, NULL);
4132 new_vect_var = create_tmp_reg (type, tmp);
4133 free (tmp);
4135 else
4136 new_vect_var = create_tmp_reg (type, prefix);
4138 return new_vect_var;
4141 /* Like vect_get_new_vect_var but return an SSA name. */
4143 tree
4144 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4146 const char *prefix;
4147 tree new_vect_var;
4149 switch (var_kind)
4151 case vect_simple_var:
4152 prefix = "vect";
4153 break;
4154 case vect_scalar_var:
4155 prefix = "stmp";
4156 break;
4157 case vect_pointer_var:
4158 prefix = "vectp";
4159 break;
4160 default:
4161 gcc_unreachable ();
4164 if (name)
4166 char* tmp = concat (prefix, "_", name, NULL);
4167 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4168 free (tmp);
4170 else
4171 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4173 return new_vect_var;
4176 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4178 static void
4179 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
4180 stmt_vec_info stmt_info)
4182 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4183 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
4184 int misalign = DR_MISALIGNMENT (dr);
4185 if (misalign == -1)
4186 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4187 else
4188 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
4191 /* Function vect_create_addr_base_for_vector_ref.
4193 Create an expression that computes the address of the first memory location
4194 that will be accessed for a data reference.
4196 Input:
4197 STMT: The statement containing the data reference.
4198 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4199 OFFSET: Optional. If supplied, it is added to the initial address.
4200 LOOP: Specify relative to which loop-nest the address should be computed.
4201 For example, when the dataref is in an inner-loop nested in an
4202 outer-loop that is now being vectorized, LOOP can be either the
4203 outer-loop, or the inner-loop. The first memory location accessed
4204 by the following dataref ('in' points to short):
4206 for (i=0; i<N; i++)
4207 for (j=0; j<M; j++)
4208 s += in[i+j]
4210 is as follows:
4211 if LOOP=i_loop: &in (relative to i_loop)
4212 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4213 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4214 initial address. Unlike OFFSET, which is a number of elements to
4215 be added, BYTE_OFFSET is measured in bytes.
4217 Output:
4218 1. Return an SSA_NAME whose value is the address of the memory location of
4219 the first vector of the data reference.
4220 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4221 these statement(s) which define the returned SSA_NAME.
4223 FORNOW: We are only handling array accesses with step 1. */
4225 tree
4226 vect_create_addr_base_for_vector_ref (gimple *stmt,
4227 gimple_seq *new_stmt_list,
4228 tree offset,
4229 struct loop *loop,
4230 tree byte_offset)
4232 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4233 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4234 tree data_ref_base;
4235 const char *base_name;
4236 tree addr_base;
4237 tree dest;
4238 gimple_seq seq = NULL;
4239 tree base_offset;
4240 tree init;
4241 tree vect_ptr_type;
4242 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4243 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4245 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
4247 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
4249 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
4251 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4252 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
4253 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
4255 else
4257 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
4258 base_offset = unshare_expr (DR_OFFSET (dr));
4259 init = unshare_expr (DR_INIT (dr));
4262 if (loop_vinfo)
4263 base_name = get_name (data_ref_base);
4264 else
4266 base_offset = ssize_int (0);
4267 init = ssize_int (0);
4268 base_name = get_name (DR_REF (dr));
4271 /* Create base_offset */
4272 base_offset = size_binop (PLUS_EXPR,
4273 fold_convert (sizetype, base_offset),
4274 fold_convert (sizetype, init));
4276 if (offset)
4278 offset = fold_build2 (MULT_EXPR, sizetype,
4279 fold_convert (sizetype, offset), step);
4280 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4281 base_offset, offset);
4283 if (byte_offset)
4285 byte_offset = fold_convert (sizetype, byte_offset);
4286 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4287 base_offset, byte_offset);
4290 /* base + base_offset */
4291 if (loop_vinfo)
4292 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4293 else
4295 addr_base = build1 (ADDR_EXPR,
4296 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4297 unshare_expr (DR_REF (dr)));
4300 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4301 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4302 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4303 gimple_seq_add_seq (new_stmt_list, seq);
4305 if (DR_PTR_INFO (dr)
4306 && TREE_CODE (addr_base) == SSA_NAME
4307 && !SSA_NAME_PTR_INFO (addr_base))
4309 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4310 if (offset || byte_offset)
4311 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4314 if (dump_enabled_p ())
4316 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4317 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4318 dump_printf (MSG_NOTE, "\n");
4321 return addr_base;
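/* Illustrative sketch (not part of GCC): the address built above is,
   conceptually, DR_BASE_ADDRESS + (DR_OFFSET + DR_INIT) plus the optional
   OFFSET (in elements) and BYTE_OFFSET (in bytes).  The disabled helper
   below merely restates that arithmetic; its name and parameters are
   hypothetical.  */
#if 0
static char *
addr_base_sketch (char *dr_base_address, long dr_offset, long dr_init,
                  long offset_in_elts, long elt_size, long byte_offset)
{
  long base_offset = dr_offset + dr_init
                     + offset_in_elts * elt_size  /* OFFSET, if supplied.  */
                     + byte_offset;               /* BYTE_OFFSET, if supplied.  */
  return dr_base_address + base_offset;
}
#endif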
4325 /* Function vect_create_data_ref_ptr.
4327 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4328 location accessed in the loop by STMT, along with the def-use update
4329 chain to appropriately advance the pointer through the loop iterations.
4330 Also set aliasing information for the pointer. This pointer is used by
4331 the callers to this function to create a memory reference expression for
4332 vector load/store access.
4334 Input:
4335 1. STMT: a stmt that references memory. Expected to be of the form
4336 GIMPLE_ASSIGN <name, data-ref> or
4337 GIMPLE_ASSIGN <data-ref, name>.
4338 2. AGGR_TYPE: the type of the reference, which should be either a vector
4339 or an array.
4340 3. AT_LOOP: the loop where the vector memref is to be created.
4341 4. OFFSET (optional): an offset to be added to the initial address accessed
4342 by the data-ref in STMT.
4343 5. BSI: location where the new stmts are to be placed if there is no loop
4344 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4345 pointing to the initial address.
4346 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4347 to the initial address accessed by the data-ref in STMT. This is
4348 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4349 in bytes.
4351 Output:
4352 1. Declare a new ptr to vector_type, and have it point to the base of the
4353 data reference (initial address accessed by the data reference).
4354 For example, for vector of type V8HI, the following code is generated:
4356 v8hi *ap;
4357 ap = (v8hi *)initial_address;
4359 if OFFSET is not supplied:
4360 initial_address = &a[init];
4361 if OFFSET is supplied:
4362 initial_address = &a[init + OFFSET];
4363 if BYTE_OFFSET is supplied:
4364 initial_address = &a[init] + BYTE_OFFSET;
4366 Return the initial_address in INITIAL_ADDRESS.
4368 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4369 update the pointer in each iteration of the loop.
4371 Return the increment stmt that updates the pointer in PTR_INCR.
4373 3. Set INV_P to true if the access pattern of the data reference in the
4374 vectorized loop is invariant. Set it to false otherwise.
4376 4. Return the pointer. */
4378 tree
4379 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4380 tree offset, tree *initial_address,
4381 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4382 bool only_init, bool *inv_p, tree byte_offset)
4384 const char *base_name;
4385 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4386 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4387 struct loop *loop = NULL;
4388 bool nested_in_vect_loop = false;
4389 struct loop *containing_loop = NULL;
4390 tree aggr_ptr_type;
4391 tree aggr_ptr;
4392 tree new_temp;
4393 gimple_seq new_stmt_list = NULL;
4394 edge pe = NULL;
4395 basic_block new_bb;
4396 tree aggr_ptr_init;
4397 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4398 tree aptr;
4399 gimple_stmt_iterator incr_gsi;
4400 bool insert_after;
4401 tree indx_before_incr, indx_after_incr;
4402 gimple *incr;
4403 tree step;
4404 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4406 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4407 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4409 if (loop_vinfo)
4411 loop = LOOP_VINFO_LOOP (loop_vinfo);
4412 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4413 containing_loop = (gimple_bb (stmt))->loop_father;
4414 pe = loop_preheader_edge (loop);
4416 else
4418 gcc_assert (bb_vinfo);
4419 only_init = true;
4420 *ptr_incr = NULL;
4423 /* Check the step (evolution) of the load in LOOP, and record
4424 whether it's invariant. */
4425 if (nested_in_vect_loop)
4426 step = STMT_VINFO_DR_STEP (stmt_info);
4427 else
4428 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4430 if (integer_zerop (step))
4431 *inv_p = true;
4432 else
4433 *inv_p = false;
4435 /* Create an expression for the first address accessed by this load
4436 in LOOP. */
4437 base_name = get_name (DR_BASE_ADDRESS (dr));
4439 if (dump_enabled_p ())
4441 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4442 dump_printf_loc (MSG_NOTE, vect_location,
4443 "create %s-pointer variable to type: ",
4444 get_tree_code_name (TREE_CODE (aggr_type)));
4445 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4446 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4447 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4448 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4449 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4450 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4451 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4452 else
4453 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4454 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4455 dump_printf (MSG_NOTE, "\n");
4458 /* (1) Create the new aggregate-pointer variable.
4459 Vector and array types inherit the alias set of their component
4460 type by default so we need to use a ref-all pointer if the data
4461 reference does not conflict with the created aggregated data
4462 reference because it is not addressable. */
4463 bool need_ref_all = false;
4464 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4465 get_alias_set (DR_REF (dr))))
4466 need_ref_all = true;
4467 /* Likewise for any of the data references in the stmt group. */
4468 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4470 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4473 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4474 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4475 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4476 get_alias_set (DR_REF (sdr))))
4478 need_ref_all = true;
4479 break;
4481 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4483 while (orig_stmt);
4485 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4486 need_ref_all);
4487 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4490 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4491 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4492 def-use update cycles for the pointer: one relative to the outer-loop
4493 (LOOP), which is what steps (3) and (4) below do. The other is relative
4494 to the inner-loop (which is the inner-most loop containing the dataref),
4495 and this is done by step (5) below.
4497 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4498 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4499 redundant. Steps (3),(4) create the following:
4501 vp0 = &base_addr;
4502 LOOP: vp1 = phi(vp0,vp2)
4505 vp2 = vp1 + step
4506 goto LOOP
4508 If there is an inner-loop nested in loop, then step (5) will also be
4509 applied, and an additional update in the inner-loop will be created:
4511 vp0 = &base_addr;
4512 LOOP: vp1 = phi(vp0,vp2)
4514 inner: vp3 = phi(vp1,vp4)
4515 vp4 = vp3 + inner_step
4516 if () goto inner
4518 vp2 = vp1 + step
4519 if () goto LOOP */
4521 /* (2) Calculate the initial address of the aggregate-pointer, and set
4522 the aggregate-pointer to point to it before the loop. */
4524 /* Create &(base[init_val+offset]) + byte_offset in the loop preheader. */
4526 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4527 offset, loop, byte_offset);
4528 if (new_stmt_list)
4530 if (pe)
4532 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4533 gcc_assert (!new_bb);
4535 else
4536 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4539 *initial_address = new_temp;
4540 aggr_ptr_init = new_temp;
4542 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4543 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4544 inner-loop nested in LOOP (during outer-loop vectorization). */
4546 /* No update in loop is required. */
4547 if (only_init && (!loop_vinfo || at_loop == loop))
4548 aptr = aggr_ptr_init;
4549 else
4551 /* The step of the aggregate pointer is the type size. */
4552 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4553 /* One exception to the above is when the scalar step of the load in
4554 LOOP is zero. In this case the step here is also zero. */
4555 if (*inv_p)
4556 iv_step = size_zero_node;
4557 else if (tree_int_cst_sgn (step) == -1)
4558 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4560 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4562 create_iv (aggr_ptr_init,
4563 fold_convert (aggr_ptr_type, iv_step),
4564 aggr_ptr, loop, &incr_gsi, insert_after,
4565 &indx_before_incr, &indx_after_incr);
4566 incr = gsi_stmt (incr_gsi);
4567 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4569 /* Copy the points-to information if it exists. */
4570 if (DR_PTR_INFO (dr))
4572 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4573 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4575 if (ptr_incr)
4576 *ptr_incr = incr;
4578 aptr = indx_before_incr;
4581 if (!nested_in_vect_loop || only_init)
4582 return aptr;
4585 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4586 nested in LOOP, if exists. */
4588 gcc_assert (nested_in_vect_loop);
4589 if (!only_init)
4591 standard_iv_increment_position (containing_loop, &incr_gsi,
4592 &insert_after);
4593 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4594 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4595 &indx_after_incr);
4596 incr = gsi_stmt (incr_gsi);
4597 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4599 /* Copy the points-to information if it exists. */
4600 if (DR_PTR_INFO (dr))
4602 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4603 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4605 if (ptr_incr)
4606 *ptr_incr = incr;
4608 return indx_before_incr;
4610 else
4611 gcc_unreachable ();
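/* Illustrative sketch (not part of GCC): a scalar model of the two def-use
   update cycles described in the comment inside the function above,
   mirroring its vp0..vp4 pseudo-code.  The outer pointer advances by
   STMT_VINFO_DR_STEP per outer iteration, the inner pointer by DR_STEP per
   inner iteration.  All names below are hypothetical.  */
#if 0
extern void use (void *);

static void
data_ref_ptr_sketch (char *base_addr, long outer_step, long inner_step,
                     long outer_n, long inner_n)
{
  char *vp1 = base_addr;                 /* vp0 = &base_addr             */
  for (long i = 0; i < outer_n; i++)     /* LOOP:  vp1 = phi (vp0, vp2)  */
    {
      char *vp3 = vp1;                   /* inner: vp3 = phi (vp1, vp4)  */
      for (long j = 0; j < inner_n; j++)
        {
          use (vp3);                     /* the vectorized access        */
          vp3 += inner_step;             /* vp4 = vp3 + inner_step       */
        }
      vp1 += outer_step;                 /* vp2 = vp1 + step             */
    }
}
#endif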
4615 /* Function bump_vector_ptr
4617 Increment a pointer (to a vector type) by vector-size. If requested,
4618 i.e. if PTR-INCR is given, then also connect the new increment stmt
4619 to the existing def-use update-chain of the pointer, by modifying
4620 the PTR_INCR as illustrated below:
4622 The pointer def-use update-chain before this function:
4623 DATAREF_PTR = phi (p_0, p_2)
4624 ....
4625 PTR_INCR: p_2 = DATAREF_PTR + step
4627 The pointer def-use update-chain after this function:
4628 DATAREF_PTR = phi (p_0, p_2)
4629 ....
4630 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4631 ....
4632 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4634 Input:
4635 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4636 in the loop.
4637 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4638 the loop. The increment amount across iterations is expected
4639 to be vector_size.
4640 BSI - location where the new update stmt is to be placed.
4641 STMT - the original scalar memory-access stmt that is being vectorized.
4642 BUMP - optional. The offset by which to bump the pointer. If not given,
4643 the offset is assumed to be vector_size.
4645 Output: Return NEW_DATAREF_PTR as illustrated above.
4649 tree
4650 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4651 gimple *stmt, tree bump)
4653 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4654 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4655 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4656 tree update = TYPE_SIZE_UNIT (vectype);
4657 gassign *incr_stmt;
4658 ssa_op_iter iter;
4659 use_operand_p use_p;
4660 tree new_dataref_ptr;
4662 if (bump)
4663 update = bump;
4665 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4666 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4667 else
4668 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4669 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4670 dataref_ptr, update);
4671 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4673 /* Copy the points-to information if it exists. */
4674 if (DR_PTR_INFO (dr))
4676 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4677 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4680 if (!ptr_incr)
4681 return new_dataref_ptr;
4683 /* Update the vector-pointer's cross-iteration increment. */
4684 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4686 tree use = USE_FROM_PTR (use_p);
4688 if (use == dataref_ptr)
4689 SET_USE (use_p, new_dataref_ptr);
4690 else
4691 gcc_assert (tree_int_cst_compare (use, update) == 0);
4694 return new_dataref_ptr;
4698 /* Function vect_create_destination_var.
4700 Create a new temporary of type VECTYPE. */
4702 tree
4703 vect_create_destination_var (tree scalar_dest, tree vectype)
4705 tree vec_dest;
4706 const char *name;
4707 char *new_name;
4708 tree type;
4709 enum vect_var_kind kind;
4711 kind = vectype
4712 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4713 ? vect_mask_var
4714 : vect_simple_var
4715 : vect_scalar_var;
4716 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4718 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4720 name = get_name (scalar_dest);
4721 if (name)
4722 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4723 else
4724 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4725 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4726 free (new_name);
4728 return vec_dest;
4731 /* Function vect_grouped_store_supported.
4733 Returns TRUE if interleave high and interleave low permutations
4734 are supported, and FALSE otherwise. */
4736 bool
4737 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4739 machine_mode mode = TYPE_MODE (vectype);
4741 /* vect_permute_store_chain requires the group size to be equal to 3 or
4742 be a power of two. */
4743 if (count != 3 && exact_log2 (count) == -1)
4745 if (dump_enabled_p ())
4746 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4747 "the size of the group of accesses"
4748 " is not a power of 2 or not equal to 3\n");
4749 return false;
4752 /* Check that the permutation is supported. */
4753 if (VECTOR_MODE_P (mode))
4755 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4756 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4758 if (count == 3)
4760 unsigned int j0 = 0, j1 = 0, j2 = 0;
4761 unsigned int i, j;
4763 for (j = 0; j < 3; j++)
4765 int nelt0 = ((3 - j) * nelt) % 3;
4766 int nelt1 = ((3 - j) * nelt + 1) % 3;
4767 int nelt2 = ((3 - j) * nelt + 2) % 3;
4768 for (i = 0; i < nelt; i++)
4770 if (3 * i + nelt0 < nelt)
4771 sel[3 * i + nelt0] = j0++;
4772 if (3 * i + nelt1 < nelt)
4773 sel[3 * i + nelt1] = nelt + j1++;
4774 if (3 * i + nelt2 < nelt)
4775 sel[3 * i + nelt2] = 0;
4777 if (!can_vec_perm_p (mode, false, sel))
4779 if (dump_enabled_p ())
4780 dump_printf (MSG_MISSED_OPTIMIZATION,
4781 "permutation op not supported by target.\n");
4782 return false;
4785 for (i = 0; i < nelt; i++)
4787 if (3 * i + nelt0 < nelt)
4788 sel[3 * i + nelt0] = 3 * i + nelt0;
4789 if (3 * i + nelt1 < nelt)
4790 sel[3 * i + nelt1] = 3 * i + nelt1;
4791 if (3 * i + nelt2 < nelt)
4792 sel[3 * i + nelt2] = nelt + j2++;
4794 if (!can_vec_perm_p (mode, false, sel))
4796 if (dump_enabled_p ())
4797 dump_printf (MSG_MISSED_OPTIMIZATION,
4798 "permutation op not supported by target.\n");
4799 return false;
4802 return true;
4804 else
4806 /* If length is not equal to 3 then only a power of 2 is supported. */
4807 gcc_assert (pow2p_hwi (count));
4809 for (i = 0; i < nelt / 2; i++)
4811 sel[i * 2] = i;
4812 sel[i * 2 + 1] = i + nelt;
4814 if (can_vec_perm_p (mode, false, sel))
4816 for (i = 0; i < nelt; i++)
4817 sel[i] += nelt / 2;
4818 if (can_vec_perm_p (mode, false, sel))
4819 return true;
4824 if (dump_enabled_p ())
4825 dump_printf (MSG_MISSED_OPTIMIZATION,
4826 "permutation op not supported by target.\n");
4827 return false;
4831 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4832 type VECTYPE. */
4834 bool
4835 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4837 return vect_lanes_optab_supported_p ("vec_store_lanes",
4838 vec_store_lanes_optab,
4839 vectype, count);
4843 /* Function vect_permute_store_chain.
4845 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4846 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4847 the data correctly for the stores. Return the final references for stores
4848 in RESULT_CHAIN.
4850 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4851 The input is 4 vectors each containing 8 elements. We assign a number to
4852 each element, the input sequence is:
4854 1st vec: 0 1 2 3 4 5 6 7
4855 2nd vec: 8 9 10 11 12 13 14 15
4856 3rd vec: 16 17 18 19 20 21 22 23
4857 4th vec: 24 25 26 27 28 29 30 31
4859 The output sequence should be:
4861 1st vec: 0 8 16 24 1 9 17 25
4862 2nd vec: 2 10 18 26 3 11 19 27
4863 3rd vec: 4 12 20 28 5 13 21 29
4864 4th vec: 6 14 22 30 7 15 23 31
4866 i.e., we interleave the contents of the four vectors in their order.
4868 We use interleave_high/low instructions to create such output. The input of
4869 each interleave_high/low operation is two vectors:
4870 1st vec 2nd vec
4871 0 1 2 3 4 5 6 7
4872 the even elements of the result vector are obtained left-to-right from the
4873 high/low elements of the first vector. The odd elements of the result are
4874 obtained left-to-right from the high/low elements of the second vector.
4875 The output of interleave_high will be: 0 4 1 5
4876 and of interleave_low: 2 6 3 7
4879 The permutation is done in log LENGTH stages. In each stage interleave_high
4880 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4881 where the first argument is taken from the first half of DR_CHAIN and the
4882 second argument from its second half.
4883 In our example,
4885 I1: interleave_high (1st vec, 3rd vec)
4886 I2: interleave_low (1st vec, 3rd vec)
4887 I3: interleave_high (2nd vec, 4th vec)
4888 I4: interleave_low (2nd vec, 4th vec)
4890 The output for the first stage is:
4892 I1: 0 16 1 17 2 18 3 19
4893 I2: 4 20 5 21 6 22 7 23
4894 I3: 8 24 9 25 10 26 11 27
4895 I4: 12 28 13 29 14 30 15 31
4897 The output of the second stage, i.e. the final result is:
4899 I1: 0 8 16 24 1 9 17 25
4900 I2: 2 10 18 26 3 11 19 27
4901 I3: 4 12 20 28 5 13 21 29
4902 I4: 6 14 22 30 7 15 23 31. */
4904 void
4905 vect_permute_store_chain (vec<tree> dr_chain,
4906 unsigned int length,
4907 gimple *stmt,
4908 gimple_stmt_iterator *gsi,
4909 vec<tree> *result_chain)
4911 tree vect1, vect2, high, low;
4912 gimple *perm_stmt;
4913 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4914 tree perm_mask_low, perm_mask_high;
4915 tree data_ref;
4916 tree perm3_mask_low, perm3_mask_high;
4917 unsigned int i, n, log_length = exact_log2 (length);
4918 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4919 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4921 result_chain->quick_grow (length);
4922 memcpy (result_chain->address (), dr_chain.address (),
4923 length * sizeof (tree));
4925 if (length == 3)
4927 unsigned int j0 = 0, j1 = 0, j2 = 0;
4929 for (j = 0; j < 3; j++)
4931 int nelt0 = ((3 - j) * nelt) % 3;
4932 int nelt1 = ((3 - j) * nelt + 1) % 3;
4933 int nelt2 = ((3 - j) * nelt + 2) % 3;
4935 for (i = 0; i < nelt; i++)
4937 if (3 * i + nelt0 < nelt)
4938 sel[3 * i + nelt0] = j0++;
4939 if (3 * i + nelt1 < nelt)
4940 sel[3 * i + nelt1] = nelt + j1++;
4941 if (3 * i + nelt2 < nelt)
4942 sel[3 * i + nelt2] = 0;
4944 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4946 for (i = 0; i < nelt; i++)
4948 if (3 * i + nelt0 < nelt)
4949 sel[3 * i + nelt0] = 3 * i + nelt0;
4950 if (3 * i + nelt1 < nelt)
4951 sel[3 * i + nelt1] = 3 * i + nelt1;
4952 if (3 * i + nelt2 < nelt)
4953 sel[3 * i + nelt2] = nelt + j2++;
4955 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4957 vect1 = dr_chain[0];
4958 vect2 = dr_chain[1];
4960 /* Create interleaving stmt:
4961 low = VEC_PERM_EXPR <vect1, vect2,
4962 {j, nelt, *, j + 1, nelt + j + 1, *,
4963 j + 2, nelt + j + 2, *, ...}> */
4964 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4965 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4966 vect2, perm3_mask_low);
4967 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4969 vect1 = data_ref;
4970 vect2 = dr_chain[2];
4971 /* Create interleaving stmt:
4972 low = VEC_PERM_EXPR <vect1, vect2,
4973 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4974 6, 7, nelt + j + 2, ...}> */
4975 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4976 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4977 vect2, perm3_mask_high);
4978 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4979 (*result_chain)[j] = data_ref;
4982 else
4984 /* If length is not equal to 3 then only a power of 2 is supported. */
4985 gcc_assert (pow2p_hwi (length));
4987 for (i = 0, n = nelt / 2; i < n; i++)
4989 sel[i * 2] = i;
4990 sel[i * 2 + 1] = i + nelt;
4992 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4994 for (i = 0; i < nelt; i++)
4995 sel[i] += nelt / 2;
4996 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4998 for (i = 0, n = log_length; i < n; i++)
5000 for (j = 0; j < length/2; j++)
5002 vect1 = dr_chain[j];
5003 vect2 = dr_chain[j+length/2];
5005 /* Create interleaving stmt:
5006 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5007 ...}> */
5008 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5009 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5010 vect2, perm_mask_high);
5011 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5012 (*result_chain)[2*j] = high;
5014 /* Create interleaving stmt:
5015 low = VEC_PERM_EXPR <vect1, vect2,
5016 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5017 ...}> */
5018 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5019 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5020 vect2, perm_mask_low);
5021 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5022 (*result_chain)[2*j+1] = low;
5024 memcpy (dr_chain.address (), result_chain->address (),
5025 length * sizeof (tree));
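/* Illustrative sketch (not part of GCC): a scalar model of one power-of-2
   stage performed above.  Each stage pairs vector J with vector J+LEN/2 and
   emits an interleave-high result ({0, NELT, 1, NELT+1, ...}) and an
   interleave-low result (the same selection shifted by NELT/2); after
   log2(LEN) stages the chain is fully interleaved.  NELT and all names
   are hypothetical.  */
#if 0
#define NELT 8

static void
store_chain_stage_sketch (int chain[][NELT], int out[][NELT], int len)
{
  for (int j = 0; j < len / 2; j++)
    for (int i = 0; i < NELT / 2; i++)
      {
        /* interleave_high of chain[j] and chain[j + len/2].  */
        out[2 * j][2 * i] = chain[j][i];
        out[2 * j][2 * i + 1] = chain[j + len / 2][i];
        /* interleave_low: the same selection shifted by NELT/2.  */
        out[2 * j + 1][2 * i] = chain[j][i + NELT / 2];
        out[2 * j + 1][2 * i + 1] = chain[j + len / 2][i + NELT / 2];
      }
}
#endif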
5030 /* Function vect_setup_realignment
5032 This function is called when vectorizing an unaligned load using
5033 the dr_explicit_realign[_optimized] scheme.
5034 This function generates the following code at the loop prolog:
5036 p = initial_addr;
5037 x msq_init = *(floor(p)); # prolog load
5038 realignment_token = call target_builtin;
5039 loop:
5040 x msq = phi (msq_init, ---)
5042 The stmts marked with x are generated only for the case of
5043 dr_explicit_realign_optimized.
5045 The code above sets up a new (vector) pointer, pointing to the first
5046 location accessed by STMT, and a "floor-aligned" load using that pointer.
5047 It also generates code to compute the "realignment-token" (if the relevant
5048 target hook was defined), and creates a phi-node at the loop-header bb
5049 whose arguments are the result of the prolog-load (created by this
5050 function) and the result of a load that takes place in the loop (to be
5051 created by the caller to this function).
5053 For the case of dr_explicit_realign_optimized:
5054 The caller to this function uses the phi-result (msq) to create the
5055 realignment code inside the loop, and sets up the missing phi argument,
5056 as follows:
5057 loop:
5058 msq = phi (msq_init, lsq)
5059 lsq = *(floor(p')); # load in loop
5060 result = realign_load (msq, lsq, realignment_token);
5062 For the case of dr_explicit_realign:
5063 loop:
5064 msq = *(floor(p)); # load in loop
5065 p' = p + (VS-1);
5066 lsq = *(floor(p')); # load in loop
5067 result = realign_load (msq, lsq, realignment_token);
5069 Input:
5070 STMT - (scalar) load stmt to be vectorized. This load accesses
5071 a memory location that may be unaligned.
5072 BSI - place where new code is to be inserted.
5073 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5074 is used.
5076 Output:
5077 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5078 target hook, if defined.
5079 Return value - the result of the loop-header phi node. */
5081 tree
5082 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5083 tree *realignment_token,
5084 enum dr_alignment_support alignment_support_scheme,
5085 tree init_addr,
5086 struct loop **at_loop)
5088 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5089 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5090 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5091 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5092 struct loop *loop = NULL;
5093 edge pe = NULL;
5094 tree scalar_dest = gimple_assign_lhs (stmt);
5095 tree vec_dest;
5096 gimple *inc;
5097 tree ptr;
5098 tree data_ref;
5099 basic_block new_bb;
5100 tree msq_init = NULL_TREE;
5101 tree new_temp;
5102 gphi *phi_stmt;
5103 tree msq = NULL_TREE;
5104 gimple_seq stmts = NULL;
5105 bool inv_p;
5106 bool compute_in_loop = false;
5107 bool nested_in_vect_loop = false;
5108 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5109 struct loop *loop_for_initial_load = NULL;
5111 if (loop_vinfo)
5113 loop = LOOP_VINFO_LOOP (loop_vinfo);
5114 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5117 gcc_assert (alignment_support_scheme == dr_explicit_realign
5118 || alignment_support_scheme == dr_explicit_realign_optimized);
5120 /* We need to generate three things:
5121 1. the misalignment computation
5122 2. the extra vector load (for the optimized realignment scheme).
5123 3. the phi node for the two vectors from which the realignment is
5124 done (for the optimized realignment scheme). */
5126 /* 1. Determine where to generate the misalignment computation.
5128 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5129 calculation will be generated by this function, outside the loop (in the
5130 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5131 caller, inside the loop.
5133 Background: If the misalignment remains fixed throughout the iterations of
5134 the loop, then both realignment schemes are applicable, and also the
5135 misalignment computation can be done outside LOOP. This is because we are
5136 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5137 are a multiple of VS (the Vector Size), and therefore the misalignment in
5138 different vectorized LOOP iterations is always the same.
5139 The problem arises only if the memory access is in an inner-loop nested
5140 inside LOOP, which is now being vectorized using outer-loop vectorization.
5141 This is the only case when the misalignment of the memory access may not
5142 remain fixed throughout the iterations of the inner-loop (as explained in
5143 detail in vect_supportable_dr_alignment). In this case, not only is the
5144 optimized realignment scheme not applicable, but also the misalignment
5145 computation (and generation of the realignment token that is passed to
5146 REALIGN_LOAD) have to be done inside the loop.
5148 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5149 or not, which in turn determines if the misalignment is computed inside
5150 the inner-loop, or outside LOOP. */
5152 if (init_addr != NULL_TREE || !loop_vinfo)
5154 compute_in_loop = true;
5155 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5159 /* 2. Determine where to generate the extra vector load.
5161 For the optimized realignment scheme, instead of generating two vector
5162 loads in each iteration, we generate a single extra vector load in the
5163 preheader of the loop, and in each iteration reuse the result of the
5164 vector load from the previous iteration. In case the memory access is in
5165 an inner-loop nested inside LOOP, which is now being vectorized using
5166 outer-loop vectorization, we need to determine whether this initial vector
5167 load should be generated at the preheader of the inner-loop, or can be
5168 generated at the preheader of LOOP. If the memory access has no evolution
5169 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5170 to be generated inside LOOP (in the preheader of the inner-loop). */
5172 if (nested_in_vect_loop)
5174 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5175 bool invariant_in_outerloop =
5176 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5177 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5179 else
5180 loop_for_initial_load = loop;
5181 if (at_loop)
5182 *at_loop = loop_for_initial_load;
5184 if (loop_for_initial_load)
5185 pe = loop_preheader_edge (loop_for_initial_load);
5187 /* 3. For the case of the optimized realignment, create the first vector
5188 load at the loop preheader. */
5190 if (alignment_support_scheme == dr_explicit_realign_optimized)
5192 /* Create msq_init = *(floor(p1)) in the loop preheader */
5193 gassign *new_stmt;
5195 gcc_assert (!compute_in_loop);
5196 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5197 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5198 NULL_TREE, &init_addr, NULL, &inc,
5199 true, &inv_p);
5200 if (TREE_CODE (ptr) == SSA_NAME)
5201 new_temp = copy_ssa_name (ptr);
5202 else
5203 new_temp = make_ssa_name (TREE_TYPE (ptr));
5204 new_stmt = gimple_build_assign
5205 (new_temp, BIT_AND_EXPR, ptr,
5206 build_int_cst (TREE_TYPE (ptr),
5207 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
5208 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5209 gcc_assert (!new_bb);
5210 data_ref
5211 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5212 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5213 new_stmt = gimple_build_assign (vec_dest, data_ref);
5214 new_temp = make_ssa_name (vec_dest, new_stmt);
5215 gimple_assign_set_lhs (new_stmt, new_temp);
5216 if (pe)
5218 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5219 gcc_assert (!new_bb);
5221 else
5222 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5224 msq_init = gimple_assign_lhs (new_stmt);
5227 /* 4. Create realignment token using a target builtin, if available.
5228 It is done either inside the containing loop, or before LOOP (as
5229 determined above). */
5231 if (targetm.vectorize.builtin_mask_for_load)
5233 gcall *new_stmt;
5234 tree builtin_decl;
5236 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5237 if (!init_addr)
5239 /* Generate the INIT_ADDR computation outside LOOP. */
5240 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5241 NULL_TREE, loop);
5242 if (loop)
5244 pe = loop_preheader_edge (loop);
5245 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5246 gcc_assert (!new_bb);
5248 else
5249 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5252 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5253 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5254 vec_dest =
5255 vect_create_destination_var (scalar_dest,
5256 gimple_call_return_type (new_stmt));
5257 new_temp = make_ssa_name (vec_dest, new_stmt);
5258 gimple_call_set_lhs (new_stmt, new_temp);
5260 if (compute_in_loop)
5261 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5262 else
5264 /* Generate the misalignment computation outside LOOP. */
5265 pe = loop_preheader_edge (loop);
5266 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5267 gcc_assert (!new_bb);
5270 *realignment_token = gimple_call_lhs (new_stmt);
5272 /* The result of the CALL_EXPR to this builtin is determined from
5273 the value of the parameter and no global variables are touched
5274 which makes the builtin a "const" function. Requiring the
5275 builtin to have the "const" attribute makes it unnecessary
5276 to call mark_call_clobbered. */
5277 gcc_assert (TREE_READONLY (builtin_decl));
5280 if (alignment_support_scheme == dr_explicit_realign)
5281 return msq;
5283 gcc_assert (!compute_in_loop);
5284 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5287 /* 5. Create msq = phi <msq_init, lsq> in loop */
5289 pe = loop_preheader_edge (containing_loop);
5290 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5291 msq = make_ssa_name (vec_dest);
5292 phi_stmt = create_phi_node (msq, containing_loop->header);
5293 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5295 return msq;
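/* Illustrative sketch (not part of GCC): a byte-wise scalar model of the
   realignment that consumes the values set up above.  MSQ and LSQ stand for
   the two aligned loads surrounding the unaligned address P, and the
   misalignment plays the role of the realignment token.  VECSZ and all
   names are hypothetical; uintptr_t/size_t are assumed to come from
   <stdint.h> and <stddef.h>.  */
#if 0
#define VECSZ 16

static void
realign_load_sketch (unsigned char dst[VECSZ], const unsigned char *p)
{
  uintptr_t addr = (uintptr_t) p;
  const unsigned char *msq
    = (const unsigned char *) (addr & -(uintptr_t) VECSZ);       /* floor (p)       */
  const unsigned char *lsq
    = (const unsigned char *) ((addr + VECSZ - 1)
                               & -(uintptr_t) VECSZ);            /* floor (p+VS-1)  */
  size_t mis = addr & (VECSZ - 1);                               /* realignment token */
  for (size_t i = 0; i < VECSZ; i++)
    dst[i] = i + mis < VECSZ ? msq[i + mis] : lsq[i + mis - VECSZ];
}
#endif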
5299 /* Function vect_grouped_load_supported.
5301 COUNT is the size of the load group (the number of statements plus the
5302 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5303 only one statement, with a gap of COUNT - 1.
5305 Returns true if a suitable permute exists. */
5307 bool
5308 vect_grouped_load_supported (tree vectype, bool single_element_p,
5309 unsigned HOST_WIDE_INT count)
5311 machine_mode mode = TYPE_MODE (vectype);
5313 /* If this is single-element interleaving with an element distance
5314 that leaves unused vector loads around punt - we at least create
5315 very sub-optimal code in that case (and blow up memory,
5316 see PR65518). */
5317 if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
5319 if (dump_enabled_p ())
5320 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5321 "single-element interleaving not supported "
5322 "for non-adjacent vector loads\n");
5323 return false;
5326 /* vect_permute_load_chain requires the group size to be equal to 3 or
5327 be a power of two. */
5328 if (count != 3 && exact_log2 (count) == -1)
5330 if (dump_enabled_p ())
5331 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5332 "the size of the group of accesses"
5333 " is not a power of 2 or not equal to 3\n");
5334 return false;
5337 /* Check that the permutation is supported. */
5338 if (VECTOR_MODE_P (mode))
5340 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5341 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5343 if (count == 3)
5345 unsigned int k;
5346 for (k = 0; k < 3; k++)
5348 for (i = 0; i < nelt; i++)
5349 if (3 * i + k < 2 * nelt)
5350 sel[i] = 3 * i + k;
5351 else
5352 sel[i] = 0;
5353 if (!can_vec_perm_p (mode, false, sel))
5355 if (dump_enabled_p ())
5356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5357 "shuffle of 3 loads is not supported by"
5358 " target\n");
5359 return false;
5361 for (i = 0, j = 0; i < nelt; i++)
5362 if (3 * i + k < 2 * nelt)
5363 sel[i] = i;
5364 else
5365 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5366 if (!can_vec_perm_p (mode, false, sel))
5368 if (dump_enabled_p ())
5369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5370 "shuffle of 3 loads is not supported by"
5371 " target\n");
5372 return false;
5375 return true;
5377 else
5379 /* If length is not equal to 3 then only a power of 2 is supported. */
5380 gcc_assert (pow2p_hwi (count));
5381 for (i = 0; i < nelt; i++)
5382 sel[i] = i * 2;
5383 if (can_vec_perm_p (mode, false, sel))
5385 for (i = 0; i < nelt; i++)
5386 sel[i] = i * 2 + 1;
5387 if (can_vec_perm_p (mode, false, sel))
5388 return true;
5393 if (dump_enabled_p ())
5394 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5395 "extract even/odd not supported by target\n");
5396 return false;
5399 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5400 type VECTYPE. */
5402 bool
5403 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5405 return vect_lanes_optab_supported_p ("vec_load_lanes",
5406 vec_load_lanes_optab,
5407 vectype, count);
5410 /* Function vect_permute_load_chain.
5412 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5413 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5414 the input data correctly. Return the final references for loads in
5415 RESULT_CHAIN.
5417 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5418 The input is 4 vectors each containing 8 elements. We assign a number to each
5419 element, the input sequence is:
5421 1st vec: 0 1 2 3 4 5 6 7
5422 2nd vec: 8 9 10 11 12 13 14 15
5423 3rd vec: 16 17 18 19 20 21 22 23
5424 4th vec: 24 25 26 27 28 29 30 31
5426 The output sequence should be:
5428 1st vec: 0 4 8 12 16 20 24 28
5429 2nd vec: 1 5 9 13 17 21 25 29
5430 3rd vec: 2 6 10 14 18 22 26 30
5431 4th vec: 3 7 11 15 19 23 27 31
5433 i.e., the first output vector should contain the first elements of each
5434 interleaving group, etc.
5436 We use extract_even/odd instructions to create such output. The input of
5437 each extract_even/odd operation is two vectors
5438 1st vec 2nd vec
5439 0 1 2 3 4 5 6 7
5441 and the output is the vector of extracted even/odd elements. The output of
5442 extract_even will be: 0 2 4 6
5443 and of extract_odd: 1 3 5 7
5446 The permutation is done in log LENGTH stages. In each stage extract_even
5447 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5448 their order. In our example,
5450 E1: extract_even (1st vec, 2nd vec)
5451 E2: extract_odd (1st vec, 2nd vec)
5452 E3: extract_even (3rd vec, 4th vec)
5453 E4: extract_odd (3rd vec, 4th vec)
5455 The output for the first stage will be:
5457 E1: 0 2 4 6 8 10 12 14
5458 E2: 1 3 5 7 9 11 13 15
5459 E3: 16 18 20 22 24 26 28 30
5460 E4: 17 19 21 23 25 27 29 31
5462 In order to proceed and create the correct sequence for the next stage (or
5463 for the correct output, if the second stage is the last one, as in our
5464 example), we first put the output of extract_even operation and then the
5465 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5466 The input for the second stage is:
5468 1st vec (E1): 0 2 4 6 8 10 12 14
5469 2nd vec (E3): 16 18 20 22 24 26 28 30
5470 3rd vec (E2): 1 3 5 7 9 11 13 15
5471 4th vec (E4): 17 19 21 23 25 27 29 31
5473 The output of the second stage:
5475 E1: 0 4 8 12 16 20 24 28
5476 E2: 2 6 10 14 18 22 26 30
5477 E3: 1 5 9 13 17 21 25 29
5478 E4: 3 7 11 15 19 23 27 31
5480 And RESULT_CHAIN after reordering:
5482 1st vec (E1): 0 4 8 12 16 20 24 28
5483 2nd vec (E3): 1 5 9 13 17 21 25 29
5484 3rd vec (E2): 2 6 10 14 18 22 26 30
5485 4th vec (E4): 3 7 11 15 19 23 27 31. */
5487 static void
5488 vect_permute_load_chain (vec<tree> dr_chain,
5489 unsigned int length,
5490 gimple *stmt,
5491 gimple_stmt_iterator *gsi,
5492 vec<tree> *result_chain)
5494 tree data_ref, first_vect, second_vect;
5495 tree perm_mask_even, perm_mask_odd;
5496 tree perm3_mask_low, perm3_mask_high;
5497 gimple *perm_stmt;
5498 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5499 unsigned int i, j, log_length = exact_log2 (length);
5500 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5501 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5503 result_chain->quick_grow (length);
5504 memcpy (result_chain->address (), dr_chain.address (),
5505 length * sizeof (tree));
5507 if (length == 3)
5509 unsigned int k;
5511 for (k = 0; k < 3; k++)
5513 for (i = 0; i < nelt; i++)
5514 if (3 * i + k < 2 * nelt)
5515 sel[i] = 3 * i + k;
5516 else
5517 sel[i] = 0;
5518 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5520 for (i = 0, j = 0; i < nelt; i++)
5521 if (3 * i + k < 2 * nelt)
5522 sel[i] = i;
5523 else
5524 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5526 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5528 first_vect = dr_chain[0];
5529 second_vect = dr_chain[1];
5531 /* Create interleaving stmt (low part of):
5532 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5533 ...}> */
5534 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5535 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5536 second_vect, perm3_mask_low);
5537 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5539 /* Create interleaving stmt (high part of):
5540 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5541 ...}> */
5542 first_vect = data_ref;
5543 second_vect = dr_chain[2];
5544 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5545 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5546 second_vect, perm3_mask_high);
5547 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5548 (*result_chain)[k] = data_ref;
5551 else
5553 /* If length is not equal to 3 then only power of 2 is supported. */
5554 gcc_assert (pow2p_hwi (length));
5556 for (i = 0; i < nelt; ++i)
5557 sel[i] = i * 2;
5558 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5560 for (i = 0; i < nelt; ++i)
5561 sel[i] = i * 2 + 1;
5562 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5564 for (i = 0; i < log_length; i++)
5566 for (j = 0; j < length; j += 2)
5568 first_vect = dr_chain[j];
5569 second_vect = dr_chain[j+1];
5571 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5572 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5573 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5574 first_vect, second_vect,
5575 perm_mask_even);
5576 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5577 (*result_chain)[j/2] = data_ref;
5579 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5580 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5581 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5582 first_vect, second_vect,
5583 perm_mask_odd);
5584 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5585 (*result_chain)[j/2+length/2] = data_ref;
5587 memcpy (dr_chain.address (), result_chain->address (),
5588 length * sizeof (tree));
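/* Illustrative, standalone sketch (not part of this file): it replays the
   extract even/odd reordering performed by vect_permute_load_chain above on
   plain int arrays, for the worked example in its comment (LENGTH == 4,
   8 elements per vector, values 0..31).  It should print the chain
   {0 4 8 ...}, {1 5 9 ...}, {2 6 10 ...}, {3 7 11 ...}.  All names here are
   local to the sketch.  */

#include <stdio.h>
#include <string.h>

#define NELT 8
#define LENGTH 4

/* OUT[i] = element SEL[i] of the concatenation of A and B.  */
static void
vec_perm (const int *a, const int *b, const unsigned char *sel, int *out)
{
  unsigned int i;
  for (i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? a[sel[i]] : b[sel[i] - NELT];
}

int
main (void)
{
  int chain[LENGTH][NELT], result[LENGTH][NELT];
  unsigned char even[NELT], odd[NELT];
  unsigned int i, j, stage;

  for (i = 0; i < LENGTH; i++)
    for (j = 0; j < NELT; j++)
      chain[i][j] = i * NELT + j;
  for (i = 0; i < NELT; i++)
    {
      even[i] = 2 * i;
      odd[i] = 2 * i + 1;
    }

  /* log2 (LENGTH) stages; in each stage the even results go to the first
     half of the chain and the odd results to the second half, as in the
     function above.  */
  for (stage = 0; stage < 2; stage++)
    {
      for (j = 0; j < LENGTH; j += 2)
        {
          vec_perm (chain[j], chain[j + 1], even, result[j / 2]);
          vec_perm (chain[j], chain[j + 1], odd, result[j / 2 + LENGTH / 2]);
        }
      memcpy (chain, result, sizeof chain);
    }

  for (i = 0; i < LENGTH; i++)
    {
      for (j = 0; j < NELT; j++)
        printf ("%2d ", chain[i][j]);
      printf ("\n");
    }
  return 0;
}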
5593 /* Function vect_shift_permute_load_chain.
5595 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
5596 sequence of stmts to reorder the input data accordingly.
5597 Return the final references for loads in RESULT_CHAIN.
5598 Return true if successful, false otherwise.
5600 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5601 The input is 3 vectors each containing 8 elements. We assign a
5602 number to each element, the input sequence is:
5604 1st vec: 0 1 2 3 4 5 6 7
5605 2nd vec: 8 9 10 11 12 13 14 15
5606 3rd vec: 16 17 18 19 20 21 22 23
5608 The output sequence should be:
5610 1st vec: 0 3 6 9 12 15 18 21
5611 2nd vec: 1 4 7 10 13 16 19 22
5612 3rd vec: 2 5 8 11 14 17 20 23
5614 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5616 First we shuffle all 3 vectors to get correct elements order:
5618 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5619 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5620 3rd vec: (16 19 22) (17 20 23) (18 21)
5622 Next we unite and shift vector 3 times:
5624 1st step:
5625 shift right by 6 the concatenation of:
5626 "1st vec" and "2nd vec"
5627 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5628 "2nd vec" and "3rd vec"
5629 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5630 "3rd vec" and "1st vec"
5631 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5632 | New vectors |
5634 So that now new vectors are:
5636 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5637 2nd vec: (10 13) (16 19 22) (17 20 23)
5638 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5640 2nd step:
5641 shift right by 5 the concatenation of:
5642 "1st vec" and "3rd vec"
5643 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5644 "2nd vec" and "1st vec"
5645 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5646 "3rd vec" and "2nd vec"
5647 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5648 | New vectors |
5650 So that now new vectors are:
5652 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5653 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5654 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5656 3rd step:
5657 shift right by 5 the concatenation of:
5658 "1st vec" and "1st vec"
5659 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5660 shift right by 3 the concatenation of:
5661 "2nd vec" and "2nd vec"
5662 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5663 | New vectors |
5665 So that now all vectors are READY:
5666 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5667 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5668 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5670 This algorithm is faster than the one in vect_permute_load_chain if:
5671 1. "shift of a concatenation" is faster than general permutation.
5672 This is usually so.
5673 2. The TARGET machine can't execute vector instructions in parallel.
5674 This is because each step of the algorithm depends on the previous one.
5675 The algorithm in vect_permute_load_chain is much more parallel.
5677 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5680 static bool
5681 vect_shift_permute_load_chain (vec<tree> dr_chain,
5682 unsigned int length,
5683 gimple *stmt,
5684 gimple_stmt_iterator *gsi,
5685 vec<tree> *result_chain)
5687 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5688 tree perm2_mask1, perm2_mask2, perm3_mask;
5689 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5690 gimple *perm_stmt;
5692 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5693 unsigned int i;
5694 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5695 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5696 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5697 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5699 result_chain->quick_grow (length);
5700 memcpy (result_chain->address (), dr_chain.address (),
5701 length * sizeof (tree));
5703 if (pow2p_hwi (length) && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5705 unsigned int j, log_length = exact_log2 (length);
5706 for (i = 0; i < nelt / 2; ++i)
5707 sel[i] = i * 2;
5708 for (i = 0; i < nelt / 2; ++i)
5709 sel[nelt / 2 + i] = i * 2 + 1;
5710 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5712 if (dump_enabled_p ())
5713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5714 "shuffle of 2 fields structure is not \
5715 supported by target\n");
5716 return false;
5718 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5720 for (i = 0; i < nelt / 2; ++i)
5721 sel[i] = i * 2 + 1;
5722 for (i = 0; i < nelt / 2; ++i)
5723 sel[nelt / 2 + i] = i * 2;
5724 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5726 if (dump_enabled_p ())
5727 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5728 "shuffle of 2 fields structure is not \
5729 supported by target\n");
5730 return false;
5732 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5734 /* Generating permutation constant to shift all elements.
5735 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5736 for (i = 0; i < nelt; i++)
5737 sel[i] = nelt / 2 + i;
5738 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5740 if (dump_enabled_p ())
5741 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5742 "shift permutation is not supported by target\n");
5743 return false;
5745 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5747 /* Generating permutation constant to select vector from 2.
5748 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5749 for (i = 0; i < nelt / 2; i++)
5750 sel[i] = i;
5751 for (i = nelt / 2; i < nelt; i++)
5752 sel[i] = nelt + i;
5753 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5755 if (dump_enabled_p ())
5756 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5757 "select is not supported by target\n");
5758 return false;
5760 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5762 for (i = 0; i < log_length; i++)
5764 for (j = 0; j < length; j += 2)
5766 first_vect = dr_chain[j];
5767 second_vect = dr_chain[j + 1];
5769 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5770 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5771 first_vect, first_vect,
5772 perm2_mask1);
5773 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5774 vect[0] = data_ref;
5776 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5777 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5778 second_vect, second_vect,
5779 perm2_mask2);
5780 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5781 vect[1] = data_ref;
5783 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5784 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5785 vect[0], vect[1], shift1_mask);
5786 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5787 (*result_chain)[j/2 + length/2] = data_ref;
5789 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5790 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5791 vect[0], vect[1], select_mask);
5792 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5793 (*result_chain)[j/2] = data_ref;
5795 memcpy (dr_chain.address (), result_chain->address (),
5796 length * sizeof (tree));
5798 return true;
5800 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5802 unsigned int k = 0, l = 0;
5804 /* Generating permutation constant to get all elements in right order.
5805 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5806 for (i = 0; i < nelt; i++)
5808 if (3 * k + (l % 3) >= nelt)
5810 k = 0;
5811 l += (3 - (nelt % 3));
5813 sel[i] = 3 * k + (l % 3);
5814 k++;
5816 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5818 if (dump_enabled_p ())
5819 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5820 "shuffle of 3 fields structure is not \
5821 supported by target\n");
5822 return false;
5824 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5826 /* Generating permutation constant to shift all elements.
5827 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5828 for (i = 0; i < nelt; i++)
5829 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5830 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5832 if (dump_enabled_p ())
5833 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5834 "shift permutation is not supported by target\n");
5835 return false;
5837 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5839 /* Generating permutation constant to shift all elements.
5840 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5841 for (i = 0; i < nelt; i++)
5842 sel[i] = 2 * (nelt / 3) + 1 + i;
5843 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5845 if (dump_enabled_p ())
5846 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5847 "shift permutation is not supported by target\n");
5848 return false;
5850 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5852 /* Generating permutation constant to shift all elements.
5853 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5854 for (i = 0; i < nelt; i++)
5855 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5856 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5858 if (dump_enabled_p ())
5859 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5860 "shift permutation is not supported by target\n");
5861 return false;
5863 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5865 /* Generating permutation constant to shift all elements.
5866 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5867 for (i = 0; i < nelt; i++)
5868 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5869 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5871 if (dump_enabled_p ())
5872 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5873 "shift permutation is not supported by target\n");
5874 return false;
5876 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5878 for (k = 0; k < 3; k++)
5880 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5881 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5882 dr_chain[k], dr_chain[k],
5883 perm3_mask);
5884 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5885 vect[k] = data_ref;
5888 for (k = 0; k < 3; k++)
5890 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5891 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5892 vect[k % 3], vect[(k + 1) % 3],
5893 shift1_mask);
5894 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5895 vect_shift[k] = data_ref;
5898 for (k = 0; k < 3; k++)
5900 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5901 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5902 vect_shift[(4 - k) % 3],
5903 vect_shift[(3 - k) % 3],
5904 shift2_mask);
5905 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5906 vect[k] = data_ref;
5909 (*result_chain)[3 - (nelt % 3)] = vect[2];
5911 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5912 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5913 vect[0], shift3_mask);
5914 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5915 (*result_chain)[nelt % 3] = data_ref;
5917 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5918 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5919 vect[1], shift4_mask);
5920 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5921 (*result_chain)[0] = data_ref;
5922 return true;
5924 return false;
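/* Illustrative, standalone sketch (not part of this file): it recomputes,
   for a hypothetical 8-element vector, the permutation selectors built by
   the LENGTH == 3 branch of vect_shift_permute_load_chain above, so the
   values quoted in its comments ({0 3 6 1 4 7 2 5}, {6 ... 13}, {5 ... 12},
   {3 ... 10} and {5 ... 12}) can be checked by hand.  */

#include <stdio.h>

static void
print_sel (const char *tag, const unsigned char *sel, unsigned int nelt)
{
  unsigned int i;
  printf ("%-6s:", tag);
  for (i = 0; i < nelt; i++)
    printf (" %u", (unsigned int) sel[i]);
  printf ("\n");
}

int
main (void)
{
  unsigned int nelt = 8, i, k = 0, l = 0;
  unsigned char sel[8];

  /* perm3_mask: gather the elements into 0, 3, 6, 1, 4, 7, 2, 5 order.  */
  for (i = 0; i < nelt; i++)
    {
      if (3 * k + (l % 3) >= nelt)
        {
          k = 0;
          l += 3 - (nelt % 3);
        }
      sel[i] = 3 * k + (l % 3);
      k++;
    }
  print_sel ("perm3", sel, nelt);

  for (i = 0; i < nelt; i++)   /* shift1_mask */
    sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
  print_sel ("shift1", sel, nelt);

  for (i = 0; i < nelt; i++)   /* shift2_mask */
    sel[i] = 2 * (nelt / 3) + 1 + i;
  print_sel ("shift2", sel, nelt);

  for (i = 0; i < nelt; i++)   /* shift3_mask */
    sel[i] = nelt / 3 + (nelt % 3) / 2 + i;
  print_sel ("shift3", sel, nelt);

  for (i = 0; i < nelt; i++)   /* shift4_mask */
    sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
  print_sel ("shift4", sel, nelt);
  return 0;
}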
5927 /* Function vect_transform_grouped_load.
5929 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5930 to perform their permutation and ascribe the result vectorized statements to
5931 the scalar statements.
5934 void
5935 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5936 gimple_stmt_iterator *gsi)
5938 machine_mode mode;
5939 vec<tree> result_chain = vNULL;
5941 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5942 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5943 vectors, that are ready for vector computation. */
5944 result_chain.create (size);
5946 /* If the reassociation width for the vector type is 2 or greater, the target
5947 machine can execute 2 or more vector instructions in parallel.  Otherwise
5948 try to get the chain for the load group using vect_shift_permute_load_chain. */
5949 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5950 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5951 || pow2p_hwi (size)
5952 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5953 gsi, &result_chain))
5954 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5955 vect_record_grouped_load_vectors (stmt, result_chain);
5956 result_chain.release ();
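/* Illustrative, standalone sketch (not part of this file): the strategy
   choice made just above, modeled on plain integers.  It returns nonzero
   when the shift-based chain is tried first, i.e. when the target cannot
   run permutes in parallel (reassociation width <= 1) and the group size
   is not a power of two.  The helper name and parameters are invented for
   the example.  */

#include <stdio.h>

static int
try_shift_permute_chain_first (int reassoc_width, unsigned int size)
{
  int size_is_pow2 = size != 0 && (size & (size - 1)) == 0;
  return reassoc_width <= 1 && !size_is_pow2;
}

int
main (void)
{
  /* size == 3 with no parallel permutes: the shift-based chain goes first;
     with reassociation width 2 the extract even/odd chain is used.  */
  printf ("%d %d\n", try_shift_permute_chain_first (1, 3),
          try_shift_permute_chain_first (2, 3)); /* prints 1 0 */
  return 0;
}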
5959 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5960 generated as part of the vectorization of STMT. Assign the statement
5961 for each vector to the associated scalar statement. */
5963 void
5964 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5966 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5967 gimple *next_stmt, *new_stmt;
5968 unsigned int i, gap_count;
5969 tree tmp_data_ref;
5971 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5972 Since we scan the chain starting from its first node, their order
5973 corresponds to the order of data-refs in RESULT_CHAIN. */
5974 next_stmt = first_stmt;
5975 gap_count = 1;
5976 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5978 if (!next_stmt)
5979 break;
5981 /* Skip the gaps.  Loads created for the gaps will be removed by the dead
5982 code elimination pass later. No need to check for the first stmt in
5983 the group, since it always exists.
5984 GROUP_GAP is the number of steps in elements from the previous
5985 access (if there is no gap GROUP_GAP is 1). We skip loads that
5986 correspond to the gaps. */
5987 if (next_stmt != first_stmt
5988 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5990 gap_count++;
5991 continue;
5994 while (next_stmt)
5996 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5997 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5998 copies, and we put the new vector statement in the first available
5999 RELATED_STMT. */
6000 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6001 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6002 else
6004 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6006 gimple *prev_stmt =
6007 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6008 gimple *rel_stmt =
6009 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6010 while (rel_stmt)
6012 prev_stmt = rel_stmt;
6013 rel_stmt =
6014 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6017 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6018 new_stmt;
6022 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6023 gap_count = 1;
6024 /* If NEXT_STMT accesses the same DR as the previous statement,
6025 put the same TMP_DATA_REF as its vectorized statement; otherwise
6026 get the next data-ref from RESULT_CHAIN. */
6027 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6028 break;
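/* Illustrative, standalone sketch (not part of this file): the bookkeeping
   above appends each extra vector copy at the end of the RELATED_STMT chain
   hanging off the scalar stmt's first vector stmt.  Modeled here as
   appending to a singly linked list; the struct and names are invented for
   the example.  */

#include <stdio.h>

struct vec_stmt
{
  int id;
  struct vec_stmt *related; /* plays the role of STMT_VINFO_RELATED_STMT */
};

/* Record NEW_STMT for a scalar stmt whose first vector stmt is *FIRST:
   either set *FIRST, or walk to the end of the related chain and append.  */
static void
record_vector_copy (struct vec_stmt **first, struct vec_stmt *new_stmt)
{
  struct vec_stmt *prev;
  if (!*first)
    {
      *first = new_stmt;
      return;
    }
  for (prev = *first; prev->related; prev = prev->related)
    ;
  prev->related = new_stmt;
}

int
main (void)
{
  struct vec_stmt a = { 1, NULL }, b = { 2, NULL }, c = { 3, NULL };
  struct vec_stmt *first = NULL, *p;

  record_vector_copy (&first, &a);
  record_vector_copy (&first, &b);
  record_vector_copy (&first, &c);
  for (p = first; p; p = p->related)
    printf ("%d ", p->id); /* prints 1 2 3 */
  printf ("\n");
  return 0;
}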
6033 /* Function vect_can_force_dr_alignment_p.
6035 Returns whether the alignment of a DECL can be forced to be aligned
6036 on an ALIGNMENT-bit boundary. */
6038 bool
6039 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6041 if (!VAR_P (decl))
6042 return false;
6044 if (decl_in_symtab_p (decl)
6045 && !symtab_node::get (decl)->can_increase_alignment_p ())
6046 return false;
6048 if (TREE_STATIC (decl))
6049 return (alignment <= MAX_OFILE_ALIGNMENT);
6050 else
6051 return (alignment <= MAX_STACK_ALIGNMENT);
6055 /* Return whether the data reference DR is supported with respect to its
6056 alignment.
6057 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6058 if it is aligned, i.e., check if it is possible to vectorize it with different
6059 alignment. */
6061 enum dr_alignment_support
6062 vect_supportable_dr_alignment (struct data_reference *dr,
6063 bool check_aligned_accesses)
6065 gimple *stmt = DR_STMT (dr);
6066 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6067 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6068 machine_mode mode = TYPE_MODE (vectype);
6069 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6070 struct loop *vect_loop = NULL;
6071 bool nested_in_vect_loop = false;
6073 if (aligned_access_p (dr) && !check_aligned_accesses)
6074 return dr_aligned;
6076 /* For now assume all conditional loads/stores support unaligned
6077 access without any special code. */
6078 if (is_gimple_call (stmt)
6079 && gimple_call_internal_p (stmt)
6080 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6081 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6082 return dr_unaligned_supported;
6084 if (loop_vinfo)
6086 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6087 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6090 /* Possibly unaligned access. */
6092 /* We can choose between using the implicit realignment scheme (generating
6093 a misaligned_move stmt) and the explicit realignment scheme (generating
6094 aligned loads with a REALIGN_LOAD). There are two variants to the
6095 explicit realignment scheme: optimized, and unoptimized.
6096 We can optimize the realignment only if the step between consecutive
6097 vector loads is equal to the vector size. Since the vector memory
6098 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6099 is guaranteed that the misalignment amount remains the same throughout the
6100 execution of the vectorized loop. Therefore, we can create the
6101 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6102 at the loop preheader.
6104 However, in the case of outer-loop vectorization, when vectorizing a
6105 memory access in the inner-loop nested within the LOOP that is now being
6106 vectorized, while it is guaranteed that the misalignment of the
6107 vectorized memory access will remain the same in different outer-loop
6108 iterations, it is *not* guaranteed that it will remain the same throughout
6109 the execution of the inner-loop. This is because the inner-loop advances
6110 with the original scalar step (and not in steps of VS). If the inner-loop
6111 step happens to be a multiple of VS, then the misalignment remains fixed
6112 and we can use the optimized realignment scheme. For example:
6114 for (i=0; i<N; i++)
6115 for (j=0; j<M; j++)
6116 s += a[i+j];
6118 When vectorizing the i-loop in the above example, the step between
6119 consecutive vector loads is 1, and so the misalignment does not remain
6120 fixed across the execution of the inner-loop, and the realignment cannot
6121 be optimized (as illustrated in the following pseudo vectorized loop):
6123 for (i=0; i<N; i+=4)
6124 for (j=0; j<M; j++){
6125 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6126 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6127 // (assuming that we start from an aligned address).
6130 We therefore have to use the unoptimized realignment scheme:
6132 for (i=0; i<N; i+=4)
6133 for (j=k; j<M; j+=4)
6134 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6135 // that the misalignment of the initial address is
6136 // 0).
6138 The loop can then be vectorized as follows:
6140 for (k=0; k<4; k++){
6141 rt = get_realignment_token (&vp[k]);
6142 for (i=0; i<N; i+=4){
6143 v1 = vp[i+k];
6144 for (j=k; j<M; j+=4){
6145 v2 = vp[i+j+VS-1];
6146 va = REALIGN_LOAD <v1,v2,rt>;
6147 vs += va;
6148 v1 = v2;
6151 } */
6153 if (DR_IS_READ (dr))
6155 bool is_packed = false;
6156 tree type = (TREE_TYPE (DR_REF (dr)));
6158 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6159 && (!targetm.vectorize.builtin_mask_for_load
6160 || targetm.vectorize.builtin_mask_for_load ()))
6162 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6164 /* If we are doing SLP then the accesses need not have the
6165 same alignment; instead it depends on the SLP group size. */
6166 if (loop_vinfo
6167 && STMT_SLP_TYPE (stmt_info)
6168 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6169 * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
6170 % TYPE_VECTOR_SUBPARTS (vectype) != 0))
6172 else if (!loop_vinfo
6173 || (nested_in_vect_loop
6174 && (TREE_INT_CST_LOW (DR_STEP (dr))
6175 != GET_MODE_SIZE (TYPE_MODE (vectype)))))
6176 return dr_explicit_realign;
6177 else
6178 return dr_explicit_realign_optimized;
6180 if (!known_alignment_for_access_p (dr))
6181 is_packed = not_size_aligned (DR_REF (dr));
6183 if (targetm.vectorize.support_vector_misalignment
6184 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6185 /* Can't software pipeline the loads, but can at least do them. */
6186 return dr_unaligned_supported;
6188 else
6190 bool is_packed = false;
6191 tree type = (TREE_TYPE (DR_REF (dr)));
6193 if (!known_alignment_for_access_p (dr))
6194 is_packed = not_size_aligned (DR_REF (dr));
6196 if (targetm.vectorize.support_vector_misalignment
6197 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6198 return dr_unaligned_supported;
6201 /* Unsupported. */
6202 return dr_unaligned_unsupported;
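/* Illustrative, standalone sketch (not part of this file): a scalar model
   of the REALIGN_LOAD step used by the explicit realignment scheme
   discussed above.  Two aligned "vector" loads V1 and V2 that surround a
   misaligned address are merged according to the realignment token RT,
   yielding the vector a misaligned load would have produced.  The token is
   modeled here simply as the element misalignment; in the real scheme it is
   the permutation mask mentioned in the comment above.  Names are invented
   for the example.  */

#include <stdio.h>

#define VS 4 /* elements per vector */

static void
realign_load (const int *v1, const int *v2, unsigned int rt, int *out)
{
  unsigned int i;
  for (i = 0; i < VS; i++)
    out[i] = rt + i < VS ? v1[rt + i] : v2[rt + i - VS];
}

int
main (void)
{
  int a[12], out[VS];
  unsigned int i, k = 1; /* misalignment in elements */

  for (i = 0; i < 12; i++)
    a[i] = i;

  /* The aligned loads at &a[0] and &a[VS] cover the misaligned access &a[k].  */
  realign_load (&a[0], &a[VS], k, out);
  for (i = 0; i < VS; i++)
    printf ("%d ", out[i]); /* prints 1 2 3 4 */
  printf ("\n");
  return 0;
}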