gcc/tree-vect-data-refs.c
/* Data References Analysis and Manipulation Utilities for Vectorization.
   Copyright (C) 2003-2017 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "predict.h"
#include "memmodel.h"
#include "tm_p.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "cgraph.h"
#include "dumpfile.h"
#include "alias.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "tree-eh.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "expr.h"
#include "builtins.h"
#include "params.h"
#include "tree-cfg.h"
/* Return true if load- or store-lanes optab OPTAB is implemented for
   COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */

static bool
vect_lanes_optab_supported_p (const char *name, convert_optab optab,
                              tree vectype, unsigned HOST_WIDE_INT count)
{
  machine_mode mode, array_mode;
  bool limit_p;

  mode = TYPE_MODE (vectype);
  limit_p = !targetm.array_mode_supported_p (mode, count);
  array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
                              MODE_INT, limit_p);

  if (array_mode == BLKmode)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
                         GET_MODE_NAME (mode), count);
      return false;
    }

  if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "cannot use %s<%s><%s>\n", name,
                         GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
      return false;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
                     GET_MODE_NAME (mode));

  return true;
}
/* Return the smallest scalar part of STMT.
   This is used to determine the vectype of the stmt.  We generally set the
   vectype according to the type of the result (lhs).  For stmts whose
   result-type is different than the type of the arguments (e.g., demotion,
   promotion), vectype will be reset appropriately (later).  Note that we have
   to visit the smallest datatype in this function, because that determines the
   VF.  If the smallest datatype in the loop is present only as the rhs of a
   promotion operation - we'd miss it.
   Such a case, where a variable of this datatype does not appear in the lhs
   anywhere in the loop, can only occur if it's an invariant: e.g.:
   'int_x = (int) short_inv', which we'd expect to have been optimized away by
   invariant motion.  However, we cannot rely on invariant motion to always
   take invariants out of the loop, and so in the case of promotion we also
   have to check the rhs.
   LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
   types.  */

tree
vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
                               HOST_WIDE_INT *rhs_size_unit)
{
  tree scalar_type = gimple_expr_type (stmt);
  HOST_WIDE_INT lhs, rhs;

  lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));

  if (is_gimple_assign (stmt)
      && (gimple_assign_cast_p (stmt)
          || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
          || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
          || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
    {
      tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));

      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
      if (rhs < lhs)
        scalar_type = rhs_type;
    }

  *lhs_size_unit = lhs;
  *rhs_size_unit = rhs;
  return scalar_type;
}
/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
   tested at run-time.  Return TRUE if DDR was successfully inserted.
   Return false if versioning is not supported.  */

static bool
vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
    return false;

  if (!runtime_alias_check_p (ddr, loop,
                              optimize_loop_nest_for_speed_p (loop)))
    return false;

  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
  return true;
}
/* Function vect_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return FALSE.  Adjust *MAX_VF according to
   the data dependence.  */

static bool
vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
                                  loop_vec_info loop_vinfo, int *max_vf)
{
  unsigned int i;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
  lambda_vector dist_v;
  unsigned int loop_depth;

  /* In loop analysis all data references should be vectorizable.  */
  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
      || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
    gcc_unreachable ();

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb
      || (DR_IS_READ (dra) && DR_IS_READ (drb)))
    return false;

  /* We do not have to consider dependences between accesses that belong
     to the same group.  */
  if (GROUP_FIRST_ELEMENT (stmtinfo_a)
      && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
    return false;

  /* Even if we have an anti-dependence then, as the vectorized loop covers at
     least two scalar iterations, there is always also a true dependence.
     As the vectorizer does not re-order loads and stores we can ignore
     the anti-dependence if TBAA can disambiguate both DRs similar to the
     case with known negative distance anti-dependences (positive
     distance anti-dependences would violate TBAA constraints).  */
  if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
       || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
      && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
                                 get_alias_set (DR_REF (drb))))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
        {
          if (loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
          return false;
        }

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "versioning for alias not supported for: "
                               "can't determine dependence between ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (dra));
              dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (drb));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          return true;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "versioning for alias required: "
                           "can't determine dependence between ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                             DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                             DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  /* Known data dependence.  */
  if (DDR_NUM_DIST_VECTS (ddr) == 0)
    {
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
        {
          if (loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
          return false;
        }

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "versioning for alias not supported for: "
                               "bad dist vector for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (dra));
              dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (drb));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          return true;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "versioning for alias required: "
                           "bad dist vector for ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "dependence distance  = %d.\n", dist);

      if (dist == 0)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "dependence distance == 0 between ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
              dump_printf (MSG_NOTE, " and ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          /* When we perform grouped accesses and perform implicit CSE
             by detecting equal accesses and doing disambiguation with
             runtime alias tests like for
               .. = a[i];
               .. = a[i+1];
               a[i] = ..;
               a[i+1] = ..;
               *p = ..;
               .. = a[i];
               .. = a[i+1];
             where we will end up loading { a[i], a[i+1] } once, make
             sure that inserting group loads before the first load and
             stores after the last store will do the right thing.
             Similar for groups like
               a[i] = ...;
               ... = a[i];
               a[i+1] = ...;
             where loads from the group interleave with the store.  */
          if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
              || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
            {
              gimple *earlier_stmt;
              earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
              if (DR_IS_WRITE
                    (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "READ_WRITE dependence in interleaving."
                                     "\n");
                  return true;
                }
            }

          continue;
        }

      if (dist > 0 && DDR_REVERSED_P (ddr))
        {
          /* If DDR_REVERSED_P the order of the data-refs in DDR was
             reversed (to make distance vector positive), and the actual
             distance is negative.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "dependence distance negative.\n");
          /* Record a negative dependence distance to later limit the
             amount of stmt copying / unrolling we can perform.
             Only need to handle read-after-write dependence.  */
          if (DR_IS_READ (drb)
              && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
                  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
            STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
          continue;
        }

      if (abs (dist) >= 2
          && abs (dist) < *max_vf)
        {
          /* The dependence distance requires reduction of the maximal
             vectorization factor.  */
          *max_vf = abs (dist);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "adjusting maximal vectorization factor to %i\n",
                             *max_vf);
        }

      if (abs (dist) >= *max_vf)
        {
          /* Dependence distance does not create dependence, as far as
             vectorization is concerned, in this case.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance >= VF.\n");
          continue;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "not vectorized, possible dependence "
                           "between data-refs ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_NOTE, " and ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_NOTE, "\n");
        }

      return true;
    }

  return false;
}
/* Function vect_analyze_data_ref_dependences.

   Examine all the data references in the loop, and make sure there do not
   exist any data dependences between them.  Set *MAX_VF according to
   the maximum vectorization factor the data dependences allow.  */

bool
vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
{
  unsigned int i;
  struct data_dependence_relation *ddr;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_data_ref_dependences ===\n");

  LOOP_VINFO_DDRS (loop_vinfo)
    .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
             * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
  /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS.  */
  if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
                                &LOOP_VINFO_DDRS (loop_vinfo),
                                LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
    return false;

  /* For epilogues we either have no aliases or alias versioning
     was applied to original loop.  Therefore we may just get max_vf
     using VF of original loop.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    *max_vf = LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo);
  else
    FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
      if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
        return false;

  return true;
}
/* Function vect_slp_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  Return FALSE if the two references are
   known to be independent.  */

static bool
vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
{
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);

  /* We need to check dependences of statements marked as unvectorizable
     as well, they still can prohibit vectorization.  */

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb)
    return false;

  /* Read-read is OK.  */
  if (DR_IS_READ (dra) && DR_IS_READ (drb))
    return false;

  /* If dra and drb are part of the same interleaving chain consider
     them independent.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
      && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
          == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "can't determine dependence between ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
    }
  else if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "determined dependence between ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
      dump_printf (MSG_NOTE, "\n");
    }

  return true;
}
/* Analyze dependences involved in the transform of SLP NODE.  STORES
   contain the vector of scalar stores of this instance if we are
   disambiguating the loads.  */

static bool
vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
                                   vec<gimple *> stores, gimple *last_store)
{
  /* This walks over all stmts involved in the SLP load/store done
     in NODE verifying we can sink them up to the last stmt in the
     group.  */
  gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
  for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
    {
      gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
      if (access == last_access)
        continue;
      data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
      for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
           gsi_stmt (gsi) != last_access; gsi_next (&gsi))
        {
          gimple *stmt = gsi_stmt (gsi);
          if (! gimple_vuse (stmt)
              || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
            continue;

          /* If we couldn't record a (single) data reference for this
             stmt we have to give up.  */
          /* ??? Here and below if dependence analysis fails we can resort
             to the alias oracle which can handle more kinds of stmts.  */
          data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
          if (!dr_b)
            return false;

          bool dependent = false;
          /* If we run into a store of this same instance (we've just
             marked those) then delay dependence checking until we run
             into the last store because this is where it will have
             been sunk to (and we verify if we can do that as well).  */
          if (gimple_visited_p (stmt))
            {
              if (stmt != last_store)
                continue;
              unsigned i;
              gimple *store;
              FOR_EACH_VEC_ELT (stores, i, store)
                {
                  data_reference *store_dr
                    = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
                  ddr_p ddr = initialize_data_dependence_relation
                                (dr_a, store_dr, vNULL);
                  dependent = vect_slp_analyze_data_ref_dependence (ddr);
                  free_dependence_relation (ddr);
                  if (dependent)
                    break;
                }
            }
          else
            {
              ddr_p ddr = initialize_data_dependence_relation (dr_a,
                                                               dr_b, vNULL);
              dependent = vect_slp_analyze_data_ref_dependence (ddr);
              free_dependence_relation (ddr);
            }
          if (dependent)
            return false;
        }
    }
  return true;
}
/* Function vect_slp_analyze_instance_dependence.

   Examine all the data references in the SLP instance, and make sure there
   do not exist any data dependences between them.  */

bool
vect_slp_analyze_instance_dependence (slp_instance instance)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_slp_analyze_instance_dependence ===\n");

  /* The stores of this instance are at the root of the SLP tree.  */
  slp_tree store = SLP_INSTANCE_TREE (instance);
  if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
    store = NULL;

  /* Verify we can sink stores to the vectorized stmt insert location.  */
  gimple *last_store = NULL;
  if (store)
    {
      if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
        return false;

      /* Mark stores in this instance and remember the last one.  */
      last_store = vect_find_last_scalar_stmt_in_slp (store);
      for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
        gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
    }

  bool res = true;

  /* Verify we can sink loads to the vectorized stmt insert location,
     special-casing stores of this instance.  */
  slp_tree load;
  unsigned int i;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
    if (! vect_slp_analyze_node_dependences (instance, load,
                                             store
                                             ? SLP_TREE_SCALAR_STMTS (store)
                                             : vNULL, last_store))
      {
        res = false;
        break;
      }

  /* Unset the visited flag.  */
  if (store)
    for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
      gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);

  return res;
}
/* Function vect_compute_data_ref_alignment

   Compute the misalignment of the data reference DR.

   Output:
   1. If during the misalignment computation it is found that the data reference
      cannot be vectorized then false is returned.
   2. DR_MISALIGNMENT (DR) is defined.

   FOR NOW: No analysis is actually performed.  Misalignment is calculated
   only for trivial cases.  TODO.  */

bool
vect_compute_data_ref_alignment (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;
  tree ref = DR_REF (dr);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_compute_data_ref_alignment:\n");

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Initialize misalignment to unknown.  */
  SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);

  innermost_loop_behavior *drb = vect_dr_behavior (dr);
  bool step_preserves_misalignment_p;

  /* No step for BB vectorization.  */
  if (!loop)
    {
      gcc_assert (integer_zerop (drb->step));
      step_preserves_misalignment_p = true;
    }

  /* In case the dataref is in an inner-loop of the loop that is being
     vectorized (LOOP), we use the base and misalignment information
     relative to the outer-loop (LOOP).  This is ok only if the misalignment
     stays the same throughout the execution of the inner-loop, which is why
     we have to check that the stride of the dataref in the inner-loop evenly
     divides by the vector size.  */
  else if (nested_in_vect_loop_p (loop, stmt))
    {
      step_preserves_misalignment_p
        = (DR_STEP_ALIGNMENT (dr)
           % GET_MODE_SIZE (TYPE_MODE (vectype))) == 0;

      if (dump_enabled_p ())
        {
          if (step_preserves_misalignment_p)
            dump_printf_loc (MSG_NOTE, vect_location,
                             "inner step divides the vector-size.\n");
          else
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "inner step doesn't divide the vector-size.\n");
        }
    }

  /* Similarly we can only use base and misalignment information relative to
     an innermost loop if the misalignment stays the same throughout the
     execution of the loop.  As above, this is the case if the stride of
     the dataref evenly divides by the vector size.  */
  else
    {
      unsigned vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      step_preserves_misalignment_p
        = ((DR_STEP_ALIGNMENT (dr) * vf)
           % GET_MODE_SIZE (TYPE_MODE (vectype))) == 0;

      if (!step_preserves_misalignment_p && dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "step doesn't divide the vector-size.\n");
    }

  unsigned int base_alignment = drb->base_alignment;
  unsigned int base_misalignment = drb->base_misalignment;
  unsigned HOST_WIDE_INT vector_alignment = TYPE_ALIGN_UNIT (vectype);

  if (drb->offset_alignment < vector_alignment
      || !step_preserves_misalignment_p
      /* We need to know whether the step wrt the vectorized loop is
         negative when computing the starting misalignment below.  */
      || TREE_CODE (drb->step) != INTEGER_CST)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Unknown alignment for access: ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return true;
    }

  if (base_alignment < vector_alignment)
    {
      tree base = drb->base_address;
      if (TREE_CODE (base) == ADDR_EXPR)
        base = TREE_OPERAND (base, 0);
      if (!vect_can_force_dr_alignment_p (base,
                                          vector_alignment * BITS_PER_UNIT))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "can't force alignment of ref: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
              dump_printf (MSG_NOTE, "\n");
            }
          return true;
        }

      if (DECL_USER_ALIGN (base))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "not forcing alignment of user-aligned "
                               "variable: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
              dump_printf (MSG_NOTE, "\n");
            }
          return true;
        }

      /* Force the alignment of the decl.
         NOTE: This is the only change to the code we make during
         the analysis phase, before deciding to vectorize the loop.  */
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
          dump_printf (MSG_NOTE, "\n");
        }

      DR_VECT_AUX (dr)->base_decl = base;
      DR_VECT_AUX (dr)->base_misaligned = true;
      base_misalignment = 0;
    }
  unsigned int misalignment = (base_misalignment
                               + TREE_INT_CST_LOW (drb->init));

  /* If this is a backward running DR then first access in the larger
     vectype actually is N-1 elements before the address in the DR.
     Adjust misalign accordingly.  */
  if (tree_int_cst_sgn (drb->step) < 0)
    /* PLUS because STEP is negative.  */
    misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
                     * TREE_INT_CST_LOW (drb->step));

  SET_DR_MISALIGNMENT (dr, misalignment & (vector_alignment - 1));

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                       "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
    }

  return true;
}
/* Function vect_update_misalignment_for_peel.
   Sets DR's misalignment
   - to 0 if it has the same alignment as DR_PEEL,
   - to the misalignment computed using NPEEL if DR's misalignment is known,
   - to -1 (unknown) otherwise.

   DR - the data reference whose misalignment is to be adjusted.
   DR_PEEL - the data reference whose misalignment is being made
             zero in the vector loop by the peel.
   NPEEL - the number of iterations in the peel loop if the misalignment
           of DR_PEEL is known at compile time.  */

static void
vect_update_misalignment_for_peel (struct data_reference *dr,
                                   struct data_reference *dr_peel, int npeel)
{
  unsigned int i;
  vec<dr_p> same_aligned_drs;
  struct data_reference *current_dr;
  int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
  int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
  stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
  stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));

  /* For interleaved data accesses the step in the loop must be multiplied by
     the size of the interleaving group.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
  if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
    dr_peel_size *= GROUP_SIZE (peel_stmt_info);

  /* It can be assumed that the data refs with the same alignment as dr_peel
     are aligned in the vector loop.  */
  same_aligned_drs
    = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
  FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
    {
      if (current_dr != dr)
        continue;
      gcc_assert (!known_alignment_for_access_p (dr)
                  || !known_alignment_for_access_p (dr_peel)
                  || (DR_MISALIGNMENT (dr) / dr_size
                      == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
      SET_DR_MISALIGNMENT (dr, 0);
      return;
    }

  if (known_alignment_for_access_p (dr)
      && known_alignment_for_access_p (dr_peel))
    {
      bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
      int misal = DR_MISALIGNMENT (dr);
      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
      misal += negative ? -npeel * dr_size : npeel * dr_size;
      misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
      SET_DR_MISALIGNMENT (dr, misal);
      return;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment "
                     "to unknown (-1).\n");
  SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
}
/* Function verify_data_ref_alignment

   Return TRUE if DR can be handled with respect to alignment.  */

static bool
verify_data_ref_alignment (data_reference_p dr)
{
  enum dr_alignment_support supportable_dr_alignment
    = vect_supportable_dr_alignment (dr, false);
  if (!supportable_dr_alignment)
    {
      if (dump_enabled_p ())
        {
          if (DR_IS_READ (dr))
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: unsupported unaligned load.");
          else
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: unsupported unaligned "
                             "store.");

          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                             DR_REF (dr));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return false;
    }

  if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "Vectorizing an unaligned access.\n");

  return true;
}
/* Function vect_verify_datarefs_alignment

   Return TRUE if all data references in the loop can be
   handled with respect to alignment.  */

bool
vect_verify_datarefs_alignment (loop_vec_info vinfo)
{
  vec<data_reference_p> datarefs = vinfo->datarefs;
  struct data_reference *dr;
  unsigned int i;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gimple *stmt = DR_STMT (dr);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

      if (!STMT_VINFO_RELEVANT_P (stmt_info))
        continue;

      /* For interleaving, only the alignment of the first access matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      if (! verify_data_ref_alignment (dr))
        return false;
    }

  return true;
}
/* Given a memory reference EXP return whether its alignment is less
   than its size.  */

static bool
not_size_aligned (tree exp)
{
  if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
    return true;

  return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
          > get_object_alignment (exp));
}
/* Function vector_alignment_reachable_p

   Return true if vector alignment for DR is reachable by peeling
   a few loop iterations.  Return false otherwise.  */

static bool
vector_alignment_reachable_p (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* For interleaved access we peel only if number of iterations in
         the prolog loop ({VF - misalignment}), is a multiple of the
         number of the interleaved accesses.  */
      int elem_size, mis_in_elements;
      int nelements = TYPE_VECTOR_SUBPARTS (vectype);

      /* FORNOW: handle only known alignment.  */
      if (!known_alignment_for_access_p (dr))
        return false;

      elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
      mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;

      if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
        return false;
    }

  /* If misalignment is known at the compile time then allow peeling
     only if natural alignment is reachable through peeling.  */
  if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
    {
      HOST_WIDE_INT elmsize =
                int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
          dump_printf (MSG_NOTE,
                       ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
        }
      if (DR_MISALIGNMENT (dr) % elmsize)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "data size does not divide the misalignment.\n");
          return false;
        }
    }

  if (!known_alignment_for_access_p (dr))
    {
      tree type = TREE_TYPE (DR_REF (dr));
      bool is_packed = not_size_aligned (DR_REF (dr));
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Unknown misalignment, %snaturally aligned\n",
                         is_packed ? "not " : "");
      return targetm.vectorize.vector_alignment_reachable (type, is_packed);
    }

  return true;
}
/* Calculate the cost of the memory access represented by DR.  */

static void
vect_get_data_access_cost (struct data_reference *dr,
                           unsigned int *inside_cost,
                           unsigned int *outside_cost,
                           stmt_vector_for_cost *body_cost_vec)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  int ncopies = MAX (1, vf / nunits); /* TODO: Handle SLP properly */

  if (DR_IS_READ (dr))
    vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
                        NULL, body_cost_vec, false);
  else
    vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_get_data_access_cost: inside_cost = %d, "
                     "outside_cost = %d.\n", *inside_cost, *outside_cost);
}
typedef struct _vect_peel_info
{
  struct data_reference *dr;
  int npeel;
  unsigned int count;
} *vect_peel_info;

typedef struct _vect_peel_extended_info
{
  struct _vect_peel_info peel_info;
  unsigned int inside_cost;
  unsigned int outside_cost;
} *vect_peel_extended_info;
/* Peeling hashtable helpers.  */

struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
{
  static inline hashval_t hash (const _vect_peel_info *);
  static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
};

inline hashval_t
peel_info_hasher::hash (const _vect_peel_info *peel_info)
{
  return (hashval_t) peel_info->npeel;
}

inline bool
peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
{
  return (a->npeel == b->npeel);
}

/* Insert DR into peeling hash table with NPEEL as key.  */

static void
vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
                          loop_vec_info loop_vinfo, struct data_reference *dr,
                          int npeel)
{
  struct _vect_peel_info elem, *slot;
  _vect_peel_info **new_slot;
  bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);

  elem.npeel = npeel;
  slot = peeling_htab->find (&elem);
  if (slot)
    slot->count++;
  else
    {
      slot = XNEW (struct _vect_peel_info);
      slot->npeel = npeel;
      slot->dr = dr;
      slot->count = 1;
      new_slot = peeling_htab->find_slot (slot, INSERT);
      *new_slot = slot;
    }

  if (!supportable_dr_alignment
      && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    slot->count += VECT_MAX_COST;
}
/* Traverse peeling hash table to find peeling option that aligns maximum
   number of data accesses.  */

int
vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
                                     _vect_peel_extended_info *max)
{
  vect_peel_info elem = *slot;

  if (elem->count > max->peel_info.count
      || (elem->count == max->peel_info.count
          && max->peel_info.npeel > elem->npeel))
    {
      max->peel_info.npeel = elem->npeel;
      max->peel_info.count = elem->count;
      max->peel_info.dr = elem->dr;
    }

  return 1;
}
/* Get the costs of peeling NPEEL iterations checking data access costs
   for all data refs.  If UNKNOWN_MISALIGNMENT is true, we assume DR0's
   misalignment will be zero after peeling.  */

static void
vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
                                struct data_reference *dr0,
                                unsigned int *inside_cost,
                                unsigned int *outside_cost,
                                stmt_vector_for_cost *body_cost_vec,
                                unsigned int npeel,
                                bool unknown_misalignment)
{
  unsigned i;
  data_reference *dr;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gimple *stmt = DR_STMT (dr);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
      /* For interleaving, only the alignment of the first access
         matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      int save_misalignment;
      save_misalignment = DR_MISALIGNMENT (dr);
      if (npeel == 0)
        ;
      else if (unknown_misalignment && dr == dr0)
        SET_DR_MISALIGNMENT (dr, 0);
      else
        vect_update_misalignment_for_peel (dr, dr0, npeel);
      vect_get_data_access_cost (dr, inside_cost, outside_cost,
                                 body_cost_vec);
      SET_DR_MISALIGNMENT (dr, save_misalignment);
    }
}
/* Traverse peeling hash table and calculate cost for each peeling option.
   Find the one with the lowest cost.  */

int
vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
                                   _vect_peel_extended_info *min)
{
  vect_peel_info elem = *slot;
  int dummy;
  unsigned int inside_cost = 0, outside_cost = 0;
  gimple *stmt = DR_STMT (elem->dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
                       epilogue_cost_vec;

  prologue_cost_vec.create (2);
  body_cost_vec.create (2);
  epilogue_cost_vec.create (2);

  vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
                                  elem->dr, &inside_cost, &outside_cost,
                                  &body_cost_vec, elem->npeel, false);

  body_cost_vec.release ();

  outside_cost += vect_get_known_peeling_cost
    (loop_vinfo, elem->npeel, &dummy,
     &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
     &prologue_cost_vec, &epilogue_cost_vec);

  /* Prologue and epilogue costs are added to the target model later.
     These costs depend only on the scalar iteration cost, the
     number of peeling iterations finally chosen, and the number of
     misaligned statements.  So discard the information found here.  */
  prologue_cost_vec.release ();
  epilogue_cost_vec.release ();

  if (inside_cost < min->inside_cost
      || (inside_cost == min->inside_cost
          && outside_cost < min->outside_cost))
    {
      min->inside_cost = inside_cost;
      min->outside_cost = outside_cost;
      min->peel_info.dr = elem->dr;
      min->peel_info.npeel = elem->npeel;
      min->peel_info.count = elem->count;
    }

  return 1;
}
/* Choose best peeling option by traversing peeling hash table and either
   choosing an option with the lowest cost (if cost model is enabled) or the
   option that aligns as many accesses as possible.  */

static struct _vect_peel_extended_info
vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
                                       loop_vec_info loop_vinfo)
{
  struct _vect_peel_extended_info res;

  res.peel_info.dr = NULL;

  if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    {
      res.inside_cost = INT_MAX;
      res.outside_cost = INT_MAX;
      peeling_htab->traverse <_vect_peel_extended_info *,
                              vect_peeling_hash_get_lowest_cost> (&res);
    }
  else
    {
      res.peel_info.count = 0;
      peeling_htab->traverse <_vect_peel_extended_info *,
                              vect_peeling_hash_get_most_frequent> (&res);
      res.inside_cost = 0;
      res.outside_cost = 0;
    }

  return res;
}
/* Return true if the new peeling NPEEL is supported.  */

static bool
vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
                          unsigned npeel)
{
  unsigned i;
  struct data_reference *dr = NULL;
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  gimple *stmt;
  stmt_vec_info stmt_info;
  enum dr_alignment_support supportable_dr_alignment;

  /* Ensure that all data refs can be vectorized after the peel.  */
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      int save_misalignment;

      if (dr == dr0)
        continue;

      stmt = DR_STMT (dr);
      stmt_info = vinfo_for_stmt (stmt);
      /* For interleaving, only the alignment of the first access
         matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      save_misalignment = DR_MISALIGNMENT (dr);
      vect_update_misalignment_for_peel (dr, dr0, npeel);
      supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
      SET_DR_MISALIGNMENT (dr, save_misalignment);

      if (!supportable_dr_alignment)
        return false;
    }

  return true;
}
/* Function vect_enhance_data_refs_alignment

   This pass will use loop versioning and loop peeling in order to enhance
   the alignment of data references in the loop.

   FOR NOW: we assume that whatever versioning/peeling takes place, only the
   original loop is to be vectorized.  Any other loops that are created by
   the transformations performed in this pass - are not supposed to be
   vectorized.  This restriction will be relaxed.

   This pass will require a cost model to guide it whether to apply peeling
   or versioning or a combination of the two.  For example, the scheme that
   Intel uses when given a loop with several memory accesses, is as follows:
   choose one memory access ('p') whose alignment you want to force by doing
   peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
   other accesses are not necessarily aligned, or (2) use loop versioning to
   generate one loop in which all accesses are aligned, and another loop in
   which only 'p' is necessarily aligned.

   ("Automatic Intra-Register Vectorization for the Intel Architecture",
   Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
   Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)

   Devising a cost model is the most critical aspect of this work.  It will
   guide us on which access to peel for, whether to use loop versioning, how
   many versions to create, etc.  The cost model will probably consist of
   generic considerations as well as target specific considerations (on
   powerpc for example, misaligned stores are more painful than misaligned
   loads).

   Here are the general steps involved in alignment enhancements:

     -- original loop, before alignment analysis:
        for (i=0; i<N; i++){
          x = q[i];                     # DR_MISALIGNMENT(q) = unknown
          p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
        }

     -- After vect_compute_data_refs_alignment:
        for (i=0; i<N; i++){
          x = q[i];                     # DR_MISALIGNMENT(q) = 3
          p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
        }

     -- Possibility 1: we do loop versioning:
     if (p is aligned) {
        for (i=0; i<N; i++){    # loop 1A
          x = q[i];                     # DR_MISALIGNMENT(q) = 3
          p[i] = y;                     # DR_MISALIGNMENT(p) = 0
        }
     }
     else {
        for (i=0; i<N; i++){    # loop 1B
          x = q[i];                     # DR_MISALIGNMENT(q) = 3
          p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
        }
     }

     -- Possibility 2: we do loop peeling:
     for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
        x = q[i];
        p[i] = y;
     }
     for (i = 3; i < N; i++){   # loop 2A
        x = q[i];                       # DR_MISALIGNMENT(q) = 0
        p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
     }

     -- Possibility 3: combination of loop peeling and versioning:
     for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
        x = q[i];
        p[i] = y;
     }
     if (p is aligned) {
        for (i = 3; i<N; i++){  # loop 3A
          x = q[i];                     # DR_MISALIGNMENT(q) = 0
          p[i] = y;                     # DR_MISALIGNMENT(p) = 0
        }
     }
     else {
        for (i = 3; i<N; i++){  # loop 3B
          x = q[i];                     # DR_MISALIGNMENT(q) = 0
          p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
        }
     }

     These loops are later passed to loop_transform to be vectorized.  The
     vectorizer will use the alignment information to guide the transformation
     (whether to generate regular loads/stores, or with special handling for
     misalignment).  */
bool
vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
{
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  enum dr_alignment_support supportable_dr_alignment;
  struct data_reference *dr0 = NULL, *first_store = NULL;
  struct data_reference *dr;
  unsigned int i, j;
  bool do_peeling = false;
  bool do_versioning = false;
  bool stat;
  gimple *stmt;
  stmt_vec_info stmt_info;
  unsigned int npeel = 0;
  bool one_misalignment_known = false;
  bool one_misalignment_unknown = false;
  bool one_dr_unsupportable = false;
  struct data_reference *unsupportable_dr = NULL;
  unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  unsigned possible_npeel_number = 1;
  tree vectype;
  unsigned int nelements, mis, same_align_drs_max = 0;
  hash_table<peel_info_hasher> peeling_htab (1);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_enhance_data_refs_alignment ===\n");

  /* Reset data so we can safely be called multiple times.  */
  LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
  LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;

  /* While cost model enhancements are expected in the future, the high level
     view of the code at this time is as follows:

     A) If there is a misaligned access then see if peeling to align
        this access can make all data references satisfy
        vect_supportable_dr_alignment.  If so, update data structures
        as needed and return true.

     B) If peeling wasn't possible and there is a data reference with an
        unknown misalignment that does not satisfy vect_supportable_dr_alignment
        then see if loop versioning checks can be used to make all data
        references satisfy vect_supportable_dr_alignment.  If so, update
        data structures as needed and return true.

     C) If neither peeling nor versioning were successful then return false if
        any data reference does not satisfy vect_supportable_dr_alignment.

     D) Return true (all data references satisfy vect_supportable_dr_alignment).

     Note, Possibility 3 above (which is peeling and versioning together) is not
     being done at this time.  */

  /* (1) Peeling to force alignment.  */

  /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
     Considerations:
     + How many accesses will become aligned due to the peeling
     - How many accesses will become unaligned due to the peeling,
       and the cost of misaligned accesses.
     - The cost of peeling (the extra runtime checks, the increase
       in code size).  */

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      stmt = DR_STMT (dr);
      stmt_info = vinfo_for_stmt (stmt);

      if (!STMT_VINFO_RELEVANT_P (stmt_info))
        continue;

      /* For interleaving, only the alignment of the first access
         matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
        continue;

      /* For invariant accesses there is nothing to enhance.  */
      if (integer_zerop (DR_STEP (dr)))
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
      do_peeling = vector_alignment_reachable_p (dr);
      if (do_peeling)
        {
          if (known_alignment_for_access_p (dr))
            {
              unsigned int npeel_tmp = 0;
              bool negative = tree_int_cst_compare (DR_STEP (dr),
                                                    size_zero_node) < 0;

              vectype = STMT_VINFO_VECTYPE (stmt_info);
              nelements = TYPE_VECTOR_SUBPARTS (vectype);
              mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
                                                TREE_TYPE (DR_REF (dr))));
              if (DR_MISALIGNMENT (dr) != 0)
                npeel_tmp = (negative ? (mis - nelements)
                             : (nelements - mis)) & (nelements - 1);

              /* For multiple types, it is possible that the bigger type access
                 will have more than one peeling option.  E.g., a loop with two
                 types: one of size (vector size / 4), and the other one of
                 size (vector size / 8).  Vectorization factor will be 8.  If
                 both accesses are misaligned by 3, the first one needs one
                 scalar iteration to be aligned, and the second one needs 5.
                 But the first one will be aligned also by peeling 5 scalar
                 iterations, and in that case both accesses will be aligned.
                 Hence, except for the immediate peeling amount, we also want
                 to try to add full vector size, while we don't exceed
                 vectorization factor.
                 We do this automatically for cost model, since we calculate
                 cost for every peeling option.  */
              if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
                {
                  if (STMT_SLP_TYPE (stmt_info))
                    possible_npeel_number
                      = (vf * GROUP_SIZE (stmt_info)) / nelements;
                  else
                    possible_npeel_number = vf / nelements;

                  /* NPEEL_TMP is 0 when there is no misalignment, but also
                     allow peeling NELEMENTS.  */
                  if (DR_MISALIGNMENT (dr) == 0)
                    possible_npeel_number++;
                }

              /* Save info about DR in the hash table.  Also include peeling
                 amounts according to the explanation above.  */
              for (j = 0; j < possible_npeel_number; j++)
                {
                  vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
                                            dr, npeel_tmp);
                  npeel_tmp += nelements;
                }

              one_misalignment_known = true;
            }
          else
            {
              /* If we don't know any misalignment values, we prefer
                 peeling for data-ref that has the maximum number of data-refs
                 with the same alignment, unless the target prefers to align
                 stores over load.  */
              unsigned same_align_drs
                = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
              if (!dr0
                  || same_align_drs_max < same_align_drs)
                {
                  same_align_drs_max = same_align_drs;
                  dr0 = dr;
                }
              /* For data-refs with the same number of related
                 accesses prefer the one where the misalign
                 computation will be invariant in the outermost loop.  */
              else if (same_align_drs_max == same_align_drs)
                {
                  struct loop *ivloop0, *ivloop;
                  ivloop0 = outermost_invariant_loop_for_expr
                    (loop, DR_BASE_ADDRESS (dr0));
                  ivloop = outermost_invariant_loop_for_expr
                    (loop, DR_BASE_ADDRESS (dr));
                  if ((ivloop && !ivloop0)
                      || (ivloop && ivloop0
                          && flow_loop_nested_p (ivloop, ivloop0)))
                    dr0 = dr;
                }

              one_misalignment_unknown = true;

              /* Check for data refs with unsupportable alignment that
                 can be peeled.  */
              if (!supportable_dr_alignment)
                {
                  one_dr_unsupportable = true;
                  unsupportable_dr = dr;
                }

              if (!first_store && DR_IS_WRITE (dr))
                first_store = dr;
            }
        }
      else
        {
          if (!aligned_access_p (dr))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "vector alignment may not be reachable\n");
              break;
            }
        }
    }

  /* Check if we can possibly peel the loop.  */
  if (!vect_can_advance_ivs_p (loop_vinfo)
      || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
      || loop->inner)
    do_peeling = false;
1632 struct _vect_peel_extended_info peel_for_known_alignment;
1633 struct _vect_peel_extended_info peel_for_unknown_alignment;
1634 struct _vect_peel_extended_info best_peel;
1636 peel_for_unknown_alignment.inside_cost = INT_MAX;
1637 peel_for_unknown_alignment.outside_cost = INT_MAX;
1638 peel_for_unknown_alignment.peel_info.count = 0;
1640 if (do_peeling
1641 && one_misalignment_unknown)
1643 /* Check if the target requires to prefer stores over loads, i.e., if
1644 misaligned stores are more expensive than misaligned loads (taking
1645 drs with same alignment into account). */
1646 unsigned int load_inside_cost = 0;
1647 unsigned int load_outside_cost = 0;
1648 unsigned int store_inside_cost = 0;
1649 unsigned int store_outside_cost = 0;
1651 stmt_vector_for_cost dummy;
1652 dummy.create (2);
1653 vect_get_peeling_costs_all_drs (datarefs, dr0,
1654 &load_inside_cost,
1655 &load_outside_cost,
1656 &dummy, vf / 2, true);
1657 dummy.release ();
1659 if (first_store)
1661 dummy.create (2);
1662 vect_get_peeling_costs_all_drs (datarefs, first_store,
1663 &store_inside_cost,
1664 &store_outside_cost,
1665 &dummy, vf / 2, true);
1666 dummy.release ();
1668 else
1670 store_inside_cost = INT_MAX;
1671 store_outside_cost = INT_MAX;
1674 if (load_inside_cost > store_inside_cost
1675 || (load_inside_cost == store_inside_cost
1676 && load_outside_cost > store_outside_cost))
1678 dr0 = first_store;
1679 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1680 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1682 else
1684 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1685 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1688 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1689 prologue_cost_vec.create (2);
1690 epilogue_cost_vec.create (2);
1692 int dummy2;
1693 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1694 (loop_vinfo, vf / 2, &dummy2,
1695 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1696 &prologue_cost_vec, &epilogue_cost_vec);
1698 prologue_cost_vec.release ();
1699 epilogue_cost_vec.release ();
1701 peel_for_unknown_alignment.peel_info.count = 1
1702 + STMT_VINFO_SAME_ALIGN_REFS
1703 (vinfo_for_stmt (DR_STMT (dr0))).length ();
1706 peel_for_unknown_alignment.peel_info.npeel = 0;
1707 peel_for_unknown_alignment.peel_info.dr = dr0;
1709 best_peel = peel_for_unknown_alignment;
1711 peel_for_known_alignment.inside_cost = INT_MAX;
1712 peel_for_known_alignment.outside_cost = INT_MAX;
1713 peel_for_known_alignment.peel_info.count = 0;
1714 peel_for_known_alignment.peel_info.dr = NULL;
1716 if (do_peeling && one_misalignment_known)
1718 /* Peeling is possible, but there is no data access that is not supported
1719 unless aligned. So we try to choose the best possible peeling from
1720 the hash table. */
1721 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1722 (&peeling_htab, loop_vinfo);
1725 /* Compare costs of peeling for known and unknown alignment. */
1726 if (peel_for_known_alignment.peel_info.dr != NULL
1727 && peel_for_unknown_alignment.inside_cost
1728 >= peel_for_known_alignment.inside_cost)
1730 best_peel = peel_for_known_alignment;
1732 /* If the best peeling for known alignment has NPEEL == 0, perform no
1733 peeling at all except if there is an unsupportable dr that we can
1734 align. */
1735 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1736 do_peeling = false;
1739 /* If there is an unsupportable data ref, prefer this over all choices so far
1740 since we'd have to discard a chosen peeling except when it accidentally
1741 aligned the unsupportable data ref. */
1742 if (one_dr_unsupportable)
1743 dr0 = unsupportable_dr;
1744 else if (do_peeling)
1746 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1747 TODO: Use nopeel_outside_cost or get rid of it? */
1748 unsigned nopeel_inside_cost = 0;
1749 unsigned nopeel_outside_cost = 0;
1751 stmt_vector_for_cost dummy;
1752 dummy.create (2);
1753 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1754 &nopeel_outside_cost, &dummy, 0, false);
1755 dummy.release ();
1757 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1758 costs will be recorded. */
1759 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1760 prologue_cost_vec.create (2);
1761 epilogue_cost_vec.create (2);
1763 int dummy2;
1764 nopeel_outside_cost += vect_get_known_peeling_cost
1765 (loop_vinfo, 0, &dummy2,
1766 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1767 &prologue_cost_vec, &epilogue_cost_vec);
1769 prologue_cost_vec.release ();
1770 epilogue_cost_vec.release ();
1772 npeel = best_peel.peel_info.npeel;
1773 dr0 = best_peel.peel_info.dr;
1775 /* If not peeling costs no more than the best peeling we have
1776 found so far, don't perform any peeling. */
1777 if (nopeel_inside_cost <= best_peel.inside_cost)
1778 do_peeling = false;
1781 if (do_peeling)
1783 stmt = DR_STMT (dr0);
1784 stmt_info = vinfo_for_stmt (stmt);
1785 vectype = STMT_VINFO_VECTYPE (stmt_info);
1786 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1788 if (known_alignment_for_access_p (dr0))
1790 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1791 size_zero_node) < 0;
1792 if (!npeel)
1794 /* Since it's known at compile time, compute the number of
1795 iterations in the peeled loop (the peeling factor) for use in
1796 updating DR_MISALIGNMENT values. The peeling factor is the
1797 vectorization factor minus the misalignment as an element
1798 count. */
1799 mis = DR_MISALIGNMENT (dr0);
1800 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1801 npeel = ((negative ? mis - nelements : nelements - mis)
1802 & (nelements - 1));
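/* Worked example with made-up numbers: for V4SI vectors (nelements == 4,
   4-byte elements) and DR_MISALIGNMENT (dr0) == 8 bytes, mis becomes
   8 / 4 == 2 elements and, for a positive step, npeel == (4 - 2) & 3 == 2,
   i.e. two scalar iterations are peeled so that dr0 starts on a vector
   boundary.  */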
1805 /* For interleaved data access every iteration accesses all the
1806 members of the group, therefore we divide the number of iterations
1807 by the group size. */
1808 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1809 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1810 npeel /= GROUP_SIZE (stmt_info);
1812 if (dump_enabled_p ())
1813 dump_printf_loc (MSG_NOTE, vect_location,
1814 "Try peeling by %d\n", npeel);
1817 /* Ensure that all datarefs can be vectorized after the peel. */
1818 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
1819 do_peeling = false;
1821 /* Check if all datarefs are supportable and log. */
1822 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1824 stat = vect_verify_datarefs_alignment (loop_vinfo);
1825 if (!stat)
1826 do_peeling = false;
1827 else
1828 return stat;
1831 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1832 if (do_peeling)
1834 unsigned max_allowed_peel
1835 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1836 if (max_allowed_peel != (unsigned)-1)
1838 unsigned max_peel = npeel;
1839 if (max_peel == 0)
1841 gimple *dr_stmt = DR_STMT (dr0);
1842 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1843 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1844 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1846 if (max_peel > max_allowed_peel)
1848 do_peeling = false;
1849 if (dump_enabled_p ())
1850 dump_printf_loc (MSG_NOTE, vect_location,
1851 "Disable peeling, max peels reached: %d\n", max_peel);
1856 /* Cost model #2 - if peeling may result in a remaining loop not
1857 iterating enough to be vectorized then do not peel. */
1858 if (do_peeling
1859 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1861 unsigned max_peel
1862 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1863 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1864 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1865 do_peeling = false;
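/* Illustration with made-up numbers: for VF == 4 and a variable peeling
   amount (npeel == 0), max_peel is 3; a loop known to run 6 iterations
   fails 6 >= 4 + 3 and is left unpeeled, whereas one running 8 iterations
   still has a full vector iteration left after peeling.  */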
1868 if (do_peeling)
1870 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1871 If the misalignment of DR_i is identical to that of dr0 then set
1872 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1873 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1874 by the peeling factor times the element size of DR_i (MOD the
1875 vectorization factor times the size). Otherwise, the
1876 misalignment of DR_i must be set to unknown. */
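/* For instance (made-up numbers): peeling npeel == 2 iterations with
   4-byte elements and a 16-byte vector size advances every such reference
   by 8 bytes, so a DR_i with known misalignment 12 ends up with
   (12 + 8) mod 16 == 4, a DR_i with the same misalignment as dr0 ends up
   at 0, and a DR_i with unknown misalignment stays unknown.  */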
1877 FOR_EACH_VEC_ELT (datarefs, i, dr)
1878 if (dr != dr0)
1880 /* Strided accesses perform only component accesses, alignment
1881 is irrelevant for them. */
1882 stmt_info = vinfo_for_stmt (DR_STMT (dr));
1883 if (STMT_VINFO_STRIDED_P (stmt_info)
1884 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1885 continue;
1887 vect_update_misalignment_for_peel (dr, dr0, npeel);
1890 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1891 if (npeel)
1892 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1893 else
1894 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1895 = DR_MISALIGNMENT (dr0);
1896 SET_DR_MISALIGNMENT (dr0, 0);
1897 if (dump_enabled_p ())
1899 dump_printf_loc (MSG_NOTE, vect_location,
1900 "Alignment of access forced using peeling.\n");
1901 dump_printf_loc (MSG_NOTE, vect_location,
1902 "Peeling for alignment will be applied.\n");
1905 /* The inside-loop cost will be accounted for in vectorizable_load
1906 and vectorizable_store correctly with adjusted alignments.
1907 Drop the body_cst_vec on the floor here. */
1908 stat = vect_verify_datarefs_alignment (loop_vinfo);
1909 gcc_assert (stat);
1910 return stat;
1914 /* (2) Versioning to force alignment. */
1916 /* Try versioning if:
1917 1) optimize loop for speed
1918 2) there is at least one unsupported misaligned data ref with an unknown
1919 misalignment, and
1920 3) all misaligned data refs with a known misalignment are supported, and
1921 4) the number of runtime alignment checks is within reason. */
1923 do_versioning =
1924 optimize_loop_nest_for_speed_p (loop)
1925 && (!loop->inner); /* FORNOW */
1927 if (do_versioning)
1929 FOR_EACH_VEC_ELT (datarefs, i, dr)
1931 stmt = DR_STMT (dr);
1932 stmt_info = vinfo_for_stmt (stmt);
1934 /* For interleaving, only the alignment of the first access
1935 matters. */
1936 if (aligned_access_p (dr)
1937 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1938 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1939 continue;
1941 if (STMT_VINFO_STRIDED_P (stmt_info))
1943 /* Strided loads perform only component accesses, alignment is
1944 irrelevant for them. */
1945 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1946 continue;
1947 do_versioning = false;
1948 break;
1951 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1953 if (!supportable_dr_alignment)
1955 gimple *stmt;
1956 int mask;
1957 tree vectype;
1959 if (known_alignment_for_access_p (dr)
1960 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1961 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1963 do_versioning = false;
1964 break;
1967 stmt = DR_STMT (dr);
1968 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1969 gcc_assert (vectype);
1971 /* The rightmost bits of an aligned address must be zeros.
1972 Construct the mask needed for this test. For example,
1973 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1974 mask must be 15 = 0xf. */
1975 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
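/* Conceptually the runtime test built from this mask checks
   ((uintptr_t) addr & mask) == 0 for every may-misalign access; e.g. with
   16-byte vectors an address passes only if it is a multiple of 16.
   (Illustration only; the actual test is emitted when the loop is
   versioned.)  */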
1977 /* FORNOW: use the same mask to test all potentially unaligned
1978 references in the loop. The vectorizer currently supports
1979 a single vector size, see the reference to
1980 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1981 vectorization factor is computed. */
1982 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1983 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1984 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1985 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1986 DR_STMT (dr));
1990 /* Versioning requires at least one misaligned data reference. */
1991 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1992 do_versioning = false;
1993 else if (!do_versioning)
1994 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1997 if (do_versioning)
1999 vec<gimple *> may_misalign_stmts
2000 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2001 gimple *stmt;
2003 /* It can now be assumed that the data references in the statements
2004 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2005 of the loop being vectorized. */
2006 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2008 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2009 dr = STMT_VINFO_DATA_REF (stmt_info);
2010 SET_DR_MISALIGNMENT (dr, 0);
2011 if (dump_enabled_p ())
2012 dump_printf_loc (MSG_NOTE, vect_location,
2013 "Alignment of access forced using versioning.\n");
2016 if (dump_enabled_p ())
2017 dump_printf_loc (MSG_NOTE, vect_location,
2018 "Versioning for alignment will be applied.\n");
2020 /* Peeling and versioning can't be done together at this time. */
2021 gcc_assert (! (do_peeling && do_versioning));
2023 stat = vect_verify_datarefs_alignment (loop_vinfo);
2024 gcc_assert (stat);
2025 return stat;
2028 /* This point is reached if neither peeling nor versioning is being done. */
2029 gcc_assert (! (do_peeling || do_versioning));
2031 stat = vect_verify_datarefs_alignment (loop_vinfo);
2032 return stat;
2036 /* Function vect_find_same_alignment_drs.
2038 Update group and alignment relations according to the chosen
2039 vectorization factor. */
2041 static void
2042 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2044 struct data_reference *dra = DDR_A (ddr);
2045 struct data_reference *drb = DDR_B (ddr);
2046 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2047 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2049 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2050 return;
2052 if (dra == drb)
2053 return;
2055 if (!operand_equal_p (DR_BASE_OBJECT (dra), DR_BASE_OBJECT (drb),
2056 OEP_ADDRESS_OF)
2057 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2058 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2059 return;
2061 /* Two references with distance zero have the same alignment. */
2062 offset_int diff = (wi::to_offset (DR_INIT (dra))
2063 - wi::to_offset (DR_INIT (drb)));
2064 if (diff != 0)
2066 /* Get the wider of the two alignments. */
2067 unsigned int align_a = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmtinfo_a));
2068 unsigned int align_b = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmtinfo_b));
2069 unsigned int max_align = MAX (align_a, align_b);
2071 /* Require the gap to be a multiple of the larger vector alignment. */
2072 if (!wi::multiple_of_p (diff, max_align, SIGNED))
2073 return;
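/* Example with made-up numbers: for two accesses whose vector type has a
   16-byte alignment requirement, a DR_INIT difference of 32 is a multiple
   of 16 and the pair is recorded below as having the same alignment,
   while a difference of 8 fails the multiple_of_p test above and the pair
   is skipped.  */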
2076 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2077 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2078 if (dump_enabled_p ())
2080 dump_printf_loc (MSG_NOTE, vect_location,
2081 "accesses have the same alignment: ");
2082 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2083 dump_printf (MSG_NOTE, " and ");
2084 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2085 dump_printf (MSG_NOTE, "\n");
2090 /* Function vect_analyze_data_refs_alignment
2092 Analyze the alignment of the data-references in the loop.
2093 Return FALSE if a data reference is found that cannot be vectorized. */
2095 bool
2096 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2098 if (dump_enabled_p ())
2099 dump_printf_loc (MSG_NOTE, vect_location,
2100 "=== vect_analyze_data_refs_alignment ===\n");
2102 /* Mark groups of data references with same alignment using
2103 data dependence information. */
2104 vec<ddr_p> ddrs = vinfo->ddrs;
2105 struct data_dependence_relation *ddr;
2106 unsigned int i;
2108 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2109 vect_find_same_alignment_drs (ddr);
2111 vec<data_reference_p> datarefs = vinfo->datarefs;
2112 struct data_reference *dr;
2114 FOR_EACH_VEC_ELT (datarefs, i, dr)
2116 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2117 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2118 && !vect_compute_data_ref_alignment (dr))
2120 /* Strided accesses perform only component accesses, misalignment
2121 information is irrelevant for them. */
2122 if (STMT_VINFO_STRIDED_P (stmt_info)
2123 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2124 continue;
2126 if (dump_enabled_p ())
2127 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2128 "not vectorized: can't calculate alignment "
2129 "for data ref.\n");
2131 return false;
2135 return true;
2139 /* Analyze alignment of DRs of stmts in NODE. */
2141 static bool
2142 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2144 /* We vectorize from the first scalar stmt in the node unless
2145 the node is permuted in which case we start from the first
2146 element in the group. */
2147 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2148 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2149 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2150 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2152 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2153 if (! vect_compute_data_ref_alignment (dr)
2154 /* For creating the data-ref pointer we need alignment of the
2155 first element anyway. */
2156 || (dr != first_dr
2157 && ! vect_compute_data_ref_alignment (first_dr))
2158 || ! verify_data_ref_alignment (dr))
2160 if (dump_enabled_p ())
2161 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2162 "not vectorized: bad data alignment in basic "
2163 "block.\n");
2164 return false;
2167 return true;
2170 /* Function vect_slp_analyze_instance_alignment
2172 Analyze the alignment of the data-references in the SLP instance.
2173 Return FALSE if a data reference is found that cannot be vectorized. */
2175 bool
2176 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2178 if (dump_enabled_p ())
2179 dump_printf_loc (MSG_NOTE, vect_location,
2180 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2182 slp_tree node;
2183 unsigned i;
2184 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2185 if (! vect_slp_analyze_and_verify_node_alignment (node))
2186 return false;
2188 node = SLP_INSTANCE_TREE (instance);
2189 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2190 && ! vect_slp_analyze_and_verify_node_alignment
2191 (SLP_INSTANCE_TREE (instance)))
2192 return false;
2194 return true;
2198 /* Analyze groups of accesses: check that DR belongs to a group of
2199 accesses of legal size, step, etc. Detect gaps, single element
2200 interleaving, and other special cases. Set grouped access info.
2201 Collect groups of strided stores for further use in SLP analysis.
2202 Worker for vect_analyze_group_access. */
2204 static bool
2205 vect_analyze_group_access_1 (struct data_reference *dr)
2207 tree step = DR_STEP (dr);
2208 tree scalar_type = TREE_TYPE (DR_REF (dr));
2209 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2210 gimple *stmt = DR_STMT (dr);
2211 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2212 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2213 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2214 HOST_WIDE_INT dr_step = -1;
2215 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2216 bool slp_impossible = false;
2218 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2219 size of the interleaving group (including gaps). */
2220 if (tree_fits_shwi_p (step))
2222 dr_step = tree_to_shwi (step);
2223 /* Check that STEP is a multiple of type size. Otherwise there is
2224 a non-element-sized gap at the end of the group which we
2225 cannot represent in GROUP_GAP or GROUP_SIZE.
2226 ??? As we can handle non-constant step fine here we should
2227 simply remove uses of GROUP_GAP between the last and first
2228 element and instead rely on DR_STEP. GROUP_SIZE then would
2229 simply not include that gap. */
2230 if ((dr_step % type_size) != 0)
2232 if (dump_enabled_p ())
2234 dump_printf_loc (MSG_NOTE, vect_location,
2235 "Step ");
2236 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2237 dump_printf (MSG_NOTE,
2238 " is not a multiple of the element size for ");
2239 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2240 dump_printf (MSG_NOTE, "\n");
2242 return false;
2244 groupsize = absu_hwi (dr_step) / type_size;
2246 else
2247 groupsize = 0;
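/* E.g. (illustration only): a load of 4-byte elements with DR_STEP 16
   gives groupsize == 16 / 4 == 4, i.e. each scalar iteration advances
   over a 4-element slot of which this DR touches one element.  */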
2249 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2250 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2252 /* Check if this DR is a part of interleaving, and is a single
2253 element of the group that is accessed in the loop. */
2255 /* Gaps are supported only for loads. STEP must be a multiple of the type
2256 size. The size of the group must be a power of 2. */
2257 if (DR_IS_READ (dr)
2258 && (dr_step % type_size) == 0
2259 && groupsize > 0
2260 && pow2p_hwi (groupsize))
2262 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2263 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2264 GROUP_GAP (stmt_info) = groupsize - 1;
2265 if (dump_enabled_p ())
2267 dump_printf_loc (MSG_NOTE, vect_location,
2268 "Detected single element interleaving ");
2269 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2270 dump_printf (MSG_NOTE, " step ");
2271 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2272 dump_printf (MSG_NOTE, "\n");
2275 return true;
2278 if (dump_enabled_p ())
2280 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2281 "not consecutive access ");
2282 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2285 if (bb_vinfo)
2287 /* Mark the statement as unvectorizable. */
2288 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2289 return true;
2292 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2293 STMT_VINFO_STRIDED_P (stmt_info) = true;
2294 return true;
2297 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2299 /* First stmt in the interleaving chain. Check the chain. */
2300 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2301 struct data_reference *data_ref = dr;
2302 unsigned int count = 1;
2303 tree prev_init = DR_INIT (data_ref);
2304 gimple *prev = stmt;
2305 HOST_WIDE_INT diff, gaps = 0;
2307 while (next)
2309 /* Skip same data-refs. In case two or more stmts share the same
2310 data-ref (supported only for loads), we vectorize only the first
2311 stmt, and the rest get their vectorized loads from the first
2312 one. */
2313 if (!tree_int_cst_compare (DR_INIT (data_ref),
2314 DR_INIT (STMT_VINFO_DATA_REF (
2315 vinfo_for_stmt (next)))))
2317 if (DR_IS_WRITE (data_ref))
2319 if (dump_enabled_p ())
2320 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2321 "Two store stmts share the same dr.\n");
2322 return false;
2325 if (dump_enabled_p ())
2326 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2327 "Two or more load stmts share the same dr.\n");
2329 /* For load use the same data-ref load. */
2330 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2332 prev = next;
2333 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2334 continue;
2337 prev = next;
2338 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2340 /* All group members have the same STEP by construction. */
2341 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2343 /* Check that the distance between two accesses is equal to the type
2344 size. Otherwise, we have gaps. */
2345 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2346 - TREE_INT_CST_LOW (prev_init)) / type_size;
2347 if (diff != 1)
2349 /* FORNOW: SLP of accesses with gaps is not supported. */
2350 slp_impossible = true;
2351 if (DR_IS_WRITE (data_ref))
2353 if (dump_enabled_p ())
2354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2355 "interleaved store with gaps\n");
2356 return false;
2359 gaps += diff - 1;
2362 last_accessed_element += diff;
2364 /* Store the gap from the previous member of the group. If there is no
2365 gap in the access, GROUP_GAP is always 1. */
2366 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2368 prev_init = DR_INIT (data_ref);
2369 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2370 /* Count the number of data-refs in the chain. */
2371 count++;
2374 if (groupsize == 0)
2375 groupsize = count + gaps;
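/* Worked example with a made-up layout: three loads of 4-byte elements at
   inits 0, 4 and 12 give diffs of 1 and 2, so gaps == 1 and, when the
   step is not constant, groupsize == count + gaps == 3 + 1 == 4;
   last_accessed_element is 1 + 1 + 2 == 4, leaving no gap after the
   group.  */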
2377 /* This could be UINT_MAX but as we are generating code in a very
2378 inefficient way we have to cap earlier. See PR78699 for example. */
2379 if (groupsize > 4096)
2381 if (dump_enabled_p ())
2382 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2383 "group is too large\n");
2384 return false;
2387 /* Check that the size of the interleaving is equal to count for stores,
2388 i.e., that there are no gaps. */
2389 if (groupsize != count
2390 && !DR_IS_READ (dr))
2392 if (dump_enabled_p ())
2393 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2394 "interleaved store with gaps\n");
2395 return false;
2398 /* If there is a gap after the last load in the group it is the
2399 difference between the groupsize and the last accessed
2400 element.
2401 When there is no gap, this difference should be 0. */
2402 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2404 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2405 if (dump_enabled_p ())
2407 dump_printf_loc (MSG_NOTE, vect_location,
2408 "Detected interleaving ");
2409 if (DR_IS_READ (dr))
2410 dump_printf (MSG_NOTE, "load ");
2411 else
2412 dump_printf (MSG_NOTE, "store ");
2413 dump_printf (MSG_NOTE, "of size %u starting with ",
2414 (unsigned)groupsize);
2415 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2416 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2417 dump_printf_loc (MSG_NOTE, vect_location,
2418 "There is a gap of %u elements after the group\n",
2419 GROUP_GAP (vinfo_for_stmt (stmt)));
2422 /* SLP: create an SLP data structure for every interleaving group of
2423 stores for further analysis in vect_analyze_slp. */
2424 if (DR_IS_WRITE (dr) && !slp_impossible)
2426 if (loop_vinfo)
2427 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2428 if (bb_vinfo)
2429 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2433 return true;
2436 /* Analyze groups of accesses: check that DR belongs to a group of
2437 accesses of legal size, step, etc. Detect gaps, single element
2438 interleaving, and other special cases. Set grouped access info.
2439 Collect groups of strided stores for further use in SLP analysis. */
2441 static bool
2442 vect_analyze_group_access (struct data_reference *dr)
2444 if (!vect_analyze_group_access_1 (dr))
2446 /* Dissolve the group if present. */
2447 gimple *next;
2448 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2449 while (stmt)
2451 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2452 next = GROUP_NEXT_ELEMENT (vinfo);
2453 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2454 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2455 stmt = next;
2457 return false;
2459 return true;
2462 /* Analyze the access pattern of the data-reference DR.
2463 In case of non-consecutive accesses call vect_analyze_group_access() to
2464 analyze groups of accesses. */
2466 static bool
2467 vect_analyze_data_ref_access (struct data_reference *dr)
2469 tree step = DR_STEP (dr);
2470 tree scalar_type = TREE_TYPE (DR_REF (dr));
2471 gimple *stmt = DR_STMT (dr);
2472 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2473 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2474 struct loop *loop = NULL;
2476 if (loop_vinfo)
2477 loop = LOOP_VINFO_LOOP (loop_vinfo);
2479 if (loop_vinfo && !step)
2481 if (dump_enabled_p ())
2482 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2483 "bad data-ref access in loop\n");
2484 return false;
2487 /* Allow loads with zero step in inner-loop vectorization. */
2488 if (loop_vinfo && integer_zerop (step))
2490 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2491 if (!nested_in_vect_loop_p (loop, stmt))
2492 return DR_IS_READ (dr);
2493 /* Allow references with zero step for outer loops marked
2494 with pragma omp simd only - it guarantees absence of
2495 loop-carried dependencies between inner loop iterations. */
2496 if (!loop->force_vectorize)
2498 if (dump_enabled_p ())
2499 dump_printf_loc (MSG_NOTE, vect_location,
2500 "zero step in inner loop of nest\n");
2501 return false;
2505 if (loop && nested_in_vect_loop_p (loop, stmt))
2507 /* Interleaved accesses are not yet supported within outer-loop
2508 vectorization for references in the inner-loop. */
2509 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2511 /* For the rest of the analysis we use the outer-loop step. */
2512 step = STMT_VINFO_DR_STEP (stmt_info);
2513 if (integer_zerop (step))
2515 if (dump_enabled_p ())
2516 dump_printf_loc (MSG_NOTE, vect_location,
2517 "zero step in outer loop.\n");
2518 return DR_IS_READ (dr);
2522 /* Consecutive? */
2523 if (TREE_CODE (step) == INTEGER_CST)
2525 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2526 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2527 || (dr_step < 0
2528 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2530 /* Mark that it is not interleaving. */
2531 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2532 return true;
2536 if (loop && nested_in_vect_loop_p (loop, stmt))
2538 if (dump_enabled_p ())
2539 dump_printf_loc (MSG_NOTE, vect_location,
2540 "grouped access in outer loop.\n");
2541 return false;
2545 /* Assume this is a DR handled by non-constant strided load case. */
2546 if (TREE_CODE (step) != INTEGER_CST)
2547 return (STMT_VINFO_STRIDED_P (stmt_info)
2548 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2549 || vect_analyze_group_access (dr)));
2551 /* Not consecutive access - check if it's a part of interleaving group. */
2552 return vect_analyze_group_access (dr);
2555 /* Compare two data-references DRA and DRB to sort them into chunks
2556 suitable for grouping. */
2558 static int
2559 dr_group_sort_cmp (const void *dra_, const void *drb_)
2561 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2562 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2563 int cmp;
2565 /* Stabilize sort. */
2566 if (dra == drb)
2567 return 0;
2569 /* DRs in different loops never belong to the same group. */
2570 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2571 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2572 if (loopa != loopb)
2573 return loopa->num < loopb->num ? -1 : 1;
2575 /* Ordering of DRs according to base. */
2576 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2578 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2579 DR_BASE_ADDRESS (drb));
2580 if (cmp != 0)
2581 return cmp;
2584 /* And according to DR_OFFSET. */
2585 if (!dr_equal_offsets_p (dra, drb))
2587 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2588 if (cmp != 0)
2589 return cmp;
2592 /* Put reads before writes. */
2593 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2594 return DR_IS_READ (dra) ? -1 : 1;
2596 /* Then sort by access size. */
2597 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2598 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2600 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2601 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2602 if (cmp != 0)
2603 return cmp;
2607 /* And by step. */
2607 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2609 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2610 if (cmp != 0)
2611 return cmp;
2614 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2615 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2616 if (cmp == 0)
2617 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2618 return cmp;
2621 /* Function vect_analyze_data_ref_accesses.
2623 Analyze the access pattern of all the data references in the loop.
2625 FORNOW: the only access pattern that is considered vectorizable is a
2626 simple step 1 (consecutive) access.
2628 FORNOW: handle only arrays and pointer accesses. */
2630 bool
2631 vect_analyze_data_ref_accesses (vec_info *vinfo)
2633 unsigned int i;
2634 vec<data_reference_p> datarefs = vinfo->datarefs;
2635 struct data_reference *dr;
2637 if (dump_enabled_p ())
2638 dump_printf_loc (MSG_NOTE, vect_location,
2639 "=== vect_analyze_data_ref_accesses ===\n");
2641 if (datarefs.is_empty ())
2642 return true;
2644 /* Sort the array of datarefs to make building the interleaving chains
2645 linear. Don't modify the original vector's order, it is needed for
2646 determining what dependencies are reversed. */
2647 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2648 datarefs_copy.qsort (dr_group_sort_cmp);
2650 /* Build the interleaving chains. */
2651 for (i = 0; i < datarefs_copy.length () - 1;)
2653 data_reference_p dra = datarefs_copy[i];
2654 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2655 stmt_vec_info lastinfo = NULL;
2656 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2658 ++i;
2659 continue;
2661 for (i = i + 1; i < datarefs_copy.length (); ++i)
2663 data_reference_p drb = datarefs_copy[i];
2664 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2665 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2666 break;
2668 /* ??? Imperfect sorting (non-compatible types, non-modulo
2669 accesses, same accesses) can lead to a group being artificially
2670 split here as we don't just skip over those. If it really
2671 matters we can push those to a worklist and re-iterate
2672 over them. Then we can just skip ahead to the next DR here. */
2674 /* DRs in a different loop should not be put into the same
2675 interleaving group. */
2676 if (gimple_bb (DR_STMT (dra))->loop_father
2677 != gimple_bb (DR_STMT (drb))->loop_father)
2678 break;
2680 /* Check that the data-refs have the same first location (except init)
2681 and that they are both either stores or loads (not one load and one
2682 store, and not masked loads or stores). */
2683 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2684 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2685 DR_BASE_ADDRESS (drb), 0)
2686 || !dr_equal_offsets_p (dra, drb)
2687 || !gimple_assign_single_p (DR_STMT (dra))
2688 || !gimple_assign_single_p (DR_STMT (drb)))
2689 break;
2691 /* Check that the data-refs have the same constant size. */
2692 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2693 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2694 if (!tree_fits_uhwi_p (sza)
2695 || !tree_fits_uhwi_p (szb)
2696 || !tree_int_cst_equal (sza, szb))
2697 break;
2699 /* Check that the data-refs have the same step. */
2700 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2701 break;
2703 /* Do not place the same access in the interleaving chain twice. */
2704 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2705 break;
2707 /* Check the types are compatible.
2708 ??? We don't distinguish this during sorting. */
2709 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2710 TREE_TYPE (DR_REF (drb))))
2711 break;
2713 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2714 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2715 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2716 gcc_assert (init_a <= init_b);
2718 /* If init_b == init_a + the size of the type * k, we have an
2719 interleaving, and DRA is accessed before DRB. */
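/* E.g. (illustration): for 4-byte accesses with init_a == 0 and
   init_b == 8 the difference is a multiple of the type size, so DRB can
   be a later member of the group started by DRA; init_b == 6 would fail
   the modulo test below and end the chain here.  */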
2720 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2721 if (type_size_a == 0
2722 || (init_b - init_a) % type_size_a != 0)
2723 break;
2725 /* If we have a store, the accesses are adjacent. This splits
2726 groups into chunks we support (we don't support vectorization
2727 of stores with gaps). */
2728 if (!DR_IS_READ (dra)
2729 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2730 (DR_INIT (datarefs_copy[i-1]))
2731 != type_size_a))
2732 break;
2734 /* If the step (when constant and nonzero) is not greater than the
2735 difference between the data-refs' inits, the accesses cannot belong
2736 to the same group; this splits groups into suitable sizes. */
2737 if (tree_fits_shwi_p (DR_STEP (dra)))
2739 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2740 if (step != 0 && step <= (init_b - init_a))
2741 break;
2744 if (dump_enabled_p ())
2746 dump_printf_loc (MSG_NOTE, vect_location,
2747 "Detected interleaving ");
2748 if (DR_IS_READ (dra))
2749 dump_printf (MSG_NOTE, "load ");
2750 else
2751 dump_printf (MSG_NOTE, "store ");
2752 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2753 dump_printf (MSG_NOTE, " and ");
2754 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2755 dump_printf (MSG_NOTE, "\n");
2758 /* Link the found element into the group list. */
2759 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2761 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2762 lastinfo = stmtinfo_a;
2764 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2765 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2766 lastinfo = stmtinfo_b;
2770 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2771 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2772 && !vect_analyze_data_ref_access (dr))
2774 if (dump_enabled_p ())
2775 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2776 "not vectorized: complicated access pattern.\n");
2778 if (is_a <bb_vec_info> (vinfo))
2780 /* Mark the statement as not vectorizable. */
2781 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2782 continue;
2784 else
2786 datarefs_copy.release ();
2787 return false;
2791 datarefs_copy.release ();
2792 return true;
2795 /* Function vect_vfa_segment_size.
2797 Create an expression that computes the size of the segment
2798 that will be accessed for a data reference. The function takes into
2799 account that realignment loads may access one more vector.
2801 Input:
2802 DR: The data reference.
2803 LENGTH_FACTOR: segment length to consider.
2805 Return an expression whose value is the size of the segment which will be
2806 accessed by DR. */
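/* For illustration (arbitrary numbers): a data reference with DR_STEP 4
   and LENGTH_FACTOR equal to a vectorization factor of 8 gets a segment
   of 4 * 8 == 32 bytes; if the reference uses the optimized explicit
   realignment scheme, one extra vector (e.g. 16 bytes) is added because
   such loads may read one vector beyond the segment.  */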
2808 static tree
2809 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2811 tree segment_length;
2813 if (integer_zerop (DR_STEP (dr)))
2814 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2815 else
2816 segment_length = size_binop (MULT_EXPR,
2817 fold_convert (sizetype, DR_STEP (dr)),
2818 fold_convert (sizetype, length_factor));
2820 if (vect_supportable_dr_alignment (dr, false)
2821 == dr_explicit_realign_optimized)
2823 tree vector_size = TYPE_SIZE_UNIT
2824 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2826 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2828 return segment_length;
2831 /* Function vect_no_alias_p.
2833 Given data references A and B with equal base and offset, whose alias
2834 relation can be decided at compilation time, return TRUE if they do
2835 not alias each other; return FALSE otherwise. SEGMENT_LENGTH_A
2836 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
2837 respectively. */
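/* Sketch of the test with made-up values: for positive steps A occupies
   [DR_INIT (a), DR_INIT (a) + SEGMENT_LENGTH_A) and similarly for B;
   segments [0, 32) and [32, 64) do not overlap, so TRUE is returned,
   while [0, 32) and [16, 48) do overlap and FALSE is returned, which the
   caller treats as a compile-time alias.  */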
2839 static bool
2840 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
2841 tree segment_length_a, tree segment_length_b)
2843 gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
2844 && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
2845 if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
2846 return false;
2848 tree seg_a_min = DR_INIT (a);
2849 tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
2850 seg_a_min, segment_length_a);
2851 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
2852 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
2853 [a, a+12) */
2854 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
2856 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
2857 seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
2858 seg_a_max, unit_size);
2859 seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
2860 DR_INIT (a), unit_size);
2862 tree seg_b_min = DR_INIT (b);
2863 tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
2864 seg_b_min, segment_length_b);
2865 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
2867 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
2868 seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
2869 seg_b_max, unit_size);
2870 seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
2871 DR_INIT (b), unit_size);
2874 if (tree_int_cst_le (seg_a_max, seg_b_min)
2875 || tree_int_cst_le (seg_b_max, seg_a_min))
2876 return true;
2878 return false;
2881 /* Function vect_prune_runtime_alias_test_list.
2883 Prune a list of ddrs to be tested at run-time by versioning for alias.
2884 Merge several alias checks into one if possible.
2885 Return FALSE if the resulting list of ddrs is longer than allowed by
2886 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2888 bool
2889 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2891 vec<ddr_p> may_alias_ddrs =
2892 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2893 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2894 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2895 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2896 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2898 ddr_p ddr;
2899 unsigned int i;
2900 tree length_factor;
2902 if (dump_enabled_p ())
2903 dump_printf_loc (MSG_NOTE, vect_location,
2904 "=== vect_prune_runtime_alias_test_list ===\n");
2906 if (may_alias_ddrs.is_empty ())
2907 return true;
2909 comp_alias_ddrs.create (may_alias_ddrs.length ());
2911 /* First, we collect all data ref pairs for aliasing checks. */
2912 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2914 int comp_res;
2915 struct data_reference *dr_a, *dr_b;
2916 gimple *dr_group_first_a, *dr_group_first_b;
2917 tree segment_length_a, segment_length_b;
2918 gimple *stmt_a, *stmt_b;
2920 dr_a = DDR_A (ddr);
2921 stmt_a = DR_STMT (DDR_A (ddr));
2922 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2923 if (dr_group_first_a)
2925 stmt_a = dr_group_first_a;
2926 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2929 dr_b = DDR_B (ddr);
2930 stmt_b = DR_STMT (DDR_B (ddr));
2931 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2932 if (dr_group_first_b)
2934 stmt_b = dr_group_first_b;
2935 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2938 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2939 length_factor = scalar_loop_iters;
2940 else
2941 length_factor = size_int (vect_factor);
2942 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2943 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2945 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
2946 DR_BASE_ADDRESS (dr_b));
2947 if (comp_res == 0)
2948 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
2949 DR_OFFSET (dr_b));
2951 /* Alias is known at compilation time. */
2952 if (comp_res == 0
2953 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
2954 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
2955 && TREE_CODE (segment_length_a) == INTEGER_CST
2956 && TREE_CODE (segment_length_b) == INTEGER_CST)
2958 if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b))
2959 continue;
2961 if (dump_enabled_p ())
2962 dump_printf_loc (MSG_NOTE, vect_location,
2963 "not vectorized: compilation time alias.\n");
2965 return false;
2968 dr_with_seg_len_pair_t dr_with_seg_len_pair
2969 (dr_with_seg_len (dr_a, segment_length_a),
2970 dr_with_seg_len (dr_b, segment_length_b));
2972 /* Canonicalize pairs by sorting the two DR members. */
2973 if (comp_res > 0)
2974 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2976 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2979 prune_runtime_alias_test_list (&comp_alias_ddrs,
2980 (unsigned HOST_WIDE_INT) vect_factor);
2981 dump_printf_loc (MSG_NOTE, vect_location,
2982 "improved number of alias checks from %d to %d\n",
2983 may_alias_ddrs.length (), comp_alias_ddrs.length ());
2984 if ((int) comp_alias_ddrs.length () >
2985 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2987 if (dump_enabled_p ())
2988 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2989 "number of versioning for alias "
2990 "run-time tests exceeds %d "
2991 "(--param vect-max-version-for-alias-checks)\n",
2992 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
2993 return false;
2996 /* All alias checks have been resolved at compilation time. */
2997 if (!comp_alias_ddrs.length ())
2998 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0);
3000 return true;
3003 /* Return true if a non-affine read or write in STMT is suitable for a
3004 gather load or scatter store. Describe the operation in *INFO if so. */
3006 bool
3007 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3008 gather_scatter_info *info)
3010 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3011 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3012 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3013 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3014 tree offtype = NULL_TREE;
3015 tree decl, base, off;
3016 machine_mode pmode;
3017 int punsignedp, reversep, pvolatilep = 0;
3019 base = DR_REF (dr);
3020 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3021 see if we can use the def stmt of the address. */
3022 if (is_gimple_call (stmt)
3023 && gimple_call_internal_p (stmt)
3024 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3025 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3026 && TREE_CODE (base) == MEM_REF
3027 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3028 && integer_zerop (TREE_OPERAND (base, 1))
3029 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3031 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3032 if (is_gimple_assign (def_stmt)
3033 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3034 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3037 /* The gather and scatter builtins need address of the form
3038 loop_invariant + vector * {1, 2, 4, 8}
3039 or
3040 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3041 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3042 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3043 multiplications and additions in it. To get a vector, we need
3044 a single SSA_NAME that will be defined in the loop and will
3045 contain everything that is not loop invariant and that can be
3046 vectorized. The following code attempts to find such a preexisting
3047 SSA_NAME OFF and put the loop invariants into a tree BASE
3048 that can be gimplified before the loop. */
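/* As an illustration (hypothetical source): for a gather like
   x = a[b[i]] with 4-byte elements, BASE would end up as the invariant
   &a, OFF as the SSA_NAME holding b[i] (defined in the loop) and SCALE
   as 4, matching the loop_invariant + vector * {1, 2, 4, 8} form
   required above.  */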
3049 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3050 &punsignedp, &reversep, &pvolatilep);
3051 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3053 if (TREE_CODE (base) == MEM_REF)
3055 if (!integer_zerop (TREE_OPERAND (base, 1)))
3057 if (off == NULL_TREE)
3059 offset_int moff = mem_ref_offset (base);
3060 off = wide_int_to_tree (sizetype, moff);
3062 else
3063 off = size_binop (PLUS_EXPR, off,
3064 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3066 base = TREE_OPERAND (base, 0);
3068 else
3069 base = build_fold_addr_expr (base);
3071 if (off == NULL_TREE)
3072 off = size_zero_node;
3074 /* If BASE is not loop invariant: when OFF is 0, start with just the
3075 constant offset in the loop-invariant BASE and continue with BASE
3076 as OFF; otherwise give up.
3077 We could handle that case by gimplifying the addition of BASE + OFF
3078 into some SSA_NAME and using that as OFF, but for now punt. */
3079 if (!expr_invariant_in_loop_p (loop, base))
3081 if (!integer_zerop (off))
3082 return false;
3083 off = base;
3084 base = size_int (pbitpos / BITS_PER_UNIT);
3086 /* Otherwise put base + constant offset into the loop invariant BASE
3087 and continue with OFF. */
3088 else
3090 base = fold_convert (sizetype, base);
3091 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3094 /* OFF at this point may be either a SSA_NAME or some tree expression
3095 from get_inner_reference. Try to peel off loop invariants from it
3096 into BASE as long as possible. */
3097 STRIP_NOPS (off);
3098 while (offtype == NULL_TREE)
3100 enum tree_code code;
3101 tree op0, op1, add = NULL_TREE;
3103 if (TREE_CODE (off) == SSA_NAME)
3105 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3107 if (expr_invariant_in_loop_p (loop, off))
3108 return false;
3110 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3111 break;
3113 op0 = gimple_assign_rhs1 (def_stmt);
3114 code = gimple_assign_rhs_code (def_stmt);
3115 op1 = gimple_assign_rhs2 (def_stmt);
3117 else
3119 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3120 return false;
3121 code = TREE_CODE (off);
3122 extract_ops_from_tree (off, &code, &op0, &op1);
3124 switch (code)
3126 case POINTER_PLUS_EXPR:
3127 case PLUS_EXPR:
3128 if (expr_invariant_in_loop_p (loop, op0))
3130 add = op0;
3131 off = op1;
3132 do_add:
3133 add = fold_convert (sizetype, add);
3134 if (scale != 1)
3135 add = size_binop (MULT_EXPR, add, size_int (scale));
3136 base = size_binop (PLUS_EXPR, base, add);
3137 continue;
3139 if (expr_invariant_in_loop_p (loop, op1))
3141 add = op1;
3142 off = op0;
3143 goto do_add;
3145 break;
3146 case MINUS_EXPR:
3147 if (expr_invariant_in_loop_p (loop, op1))
3149 add = fold_convert (sizetype, op1);
3150 add = size_binop (MINUS_EXPR, size_zero_node, add);
3151 off = op0;
3152 goto do_add;
3154 break;
3155 case MULT_EXPR:
3156 if (scale == 1 && tree_fits_shwi_p (op1))
3158 scale = tree_to_shwi (op1);
3159 off = op0;
3160 continue;
3162 break;
3163 case SSA_NAME:
3164 off = op0;
3165 continue;
3166 CASE_CONVERT:
3167 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3168 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3169 break;
3170 if (TYPE_PRECISION (TREE_TYPE (op0))
3171 == TYPE_PRECISION (TREE_TYPE (off)))
3173 off = op0;
3174 continue;
3176 if (TYPE_PRECISION (TREE_TYPE (op0))
3177 < TYPE_PRECISION (TREE_TYPE (off)))
3179 off = op0;
3180 offtype = TREE_TYPE (off);
3181 STRIP_NOPS (off);
3182 continue;
3184 break;
3185 default:
3186 break;
3188 break;
3191 /* If at the end OFF still isn't a SSA_NAME or isn't
3192 defined in the loop, punt. */
3193 if (TREE_CODE (off) != SSA_NAME
3194 || expr_invariant_in_loop_p (loop, off))
3195 return false;
3197 if (offtype == NULL_TREE)
3198 offtype = TREE_TYPE (off);
3200 if (DR_IS_READ (dr))
3201 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3202 offtype, scale);
3203 else
3204 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3205 offtype, scale);
3207 if (decl == NULL_TREE)
3208 return false;
3210 info->decl = decl;
3211 info->base = base;
3212 info->offset = off;
3213 info->offset_dt = vect_unknown_def_type;
3214 info->offset_vectype = NULL_TREE;
3215 info->scale = scale;
3216 return true;
3219 /* Function vect_analyze_data_refs.
3221 Find all the data references in the loop or basic block.
3223 The general structure of the analysis of data refs in the vectorizer is as
3224 follows:
3225 1- vect_analyze_data_refs(loop/bb): call
3226 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3227 in the loop/bb and their dependences.
3228 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3229 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3230 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3234 bool
3235 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3237 struct loop *loop = NULL;
3238 unsigned int i;
3239 struct data_reference *dr;
3240 tree scalar_type;
3242 if (dump_enabled_p ())
3243 dump_printf_loc (MSG_NOTE, vect_location,
3244 "=== vect_analyze_data_refs ===\n");
3246 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3247 loop = LOOP_VINFO_LOOP (loop_vinfo);
3249 /* Go through the data-refs, check that the analysis succeeded. Update
3250 pointer from stmt_vec_info struct to DR and vectype. */
3252 vec<data_reference_p> datarefs = vinfo->datarefs;
3253 FOR_EACH_VEC_ELT (datarefs, i, dr)
3255 gimple *stmt;
3256 stmt_vec_info stmt_info;
3257 tree base, offset, init;
3258 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3259 bool simd_lane_access = false;
3260 int vf;
3262 again:
3263 if (!dr || !DR_REF (dr))
3265 if (dump_enabled_p ())
3266 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3267 "not vectorized: unhandled data-ref\n");
3268 return false;
3271 stmt = DR_STMT (dr);
3272 stmt_info = vinfo_for_stmt (stmt);
3274 /* Discard clobbers from the dataref vector. We will remove
3275 clobber stmts during vectorization. */
3276 if (gimple_clobber_p (stmt))
3278 free_data_ref (dr);
3279 if (i == datarefs.length () - 1)
3281 datarefs.pop ();
3282 break;
3284 datarefs.ordered_remove (i);
3285 dr = datarefs[i];
3286 goto again;
3289 /* Check that analysis of the data-ref succeeded. */
3290 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3291 || !DR_STEP (dr))
3293 bool maybe_gather
3294 = DR_IS_READ (dr)
3295 && !TREE_THIS_VOLATILE (DR_REF (dr))
3296 && targetm.vectorize.builtin_gather != NULL;
3297 bool maybe_scatter
3298 = DR_IS_WRITE (dr)
3299 && !TREE_THIS_VOLATILE (DR_REF (dr))
3300 && targetm.vectorize.builtin_scatter != NULL;
3301 bool maybe_simd_lane_access
3302 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3304 /* If the target supports vector gather loads or scatter stores, or if
3305 this might be a SIMD lane access, see if they can be used. */
3306 if (is_a <loop_vec_info> (vinfo)
3307 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3308 && !nested_in_vect_loop_p (loop, stmt))
3310 struct data_reference *newdr
3311 = create_data_ref (NULL, loop_containing_stmt (stmt),
3312 DR_REF (dr), stmt, maybe_scatter ? false : true);
3313 gcc_assert (newdr != NULL && DR_REF (newdr));
3314 if (DR_BASE_ADDRESS (newdr)
3315 && DR_OFFSET (newdr)
3316 && DR_INIT (newdr)
3317 && DR_STEP (newdr)
3318 && integer_zerop (DR_STEP (newdr)))
3320 if (maybe_simd_lane_access)
3322 tree off = DR_OFFSET (newdr);
3323 STRIP_NOPS (off);
3324 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3325 && TREE_CODE (off) == MULT_EXPR
3326 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3328 tree step = TREE_OPERAND (off, 1);
3329 off = TREE_OPERAND (off, 0);
3330 STRIP_NOPS (off);
3331 if (CONVERT_EXPR_P (off)
3332 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3333 0)))
3334 < TYPE_PRECISION (TREE_TYPE (off)))
3335 off = TREE_OPERAND (off, 0);
3336 if (TREE_CODE (off) == SSA_NAME)
3338 gimple *def = SSA_NAME_DEF_STMT (off);
3339 tree reft = TREE_TYPE (DR_REF (newdr));
3340 if (is_gimple_call (def)
3341 && gimple_call_internal_p (def)
3342 && (gimple_call_internal_fn (def)
3343 == IFN_GOMP_SIMD_LANE))
3345 tree arg = gimple_call_arg (def, 0);
3346 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3347 arg = SSA_NAME_VAR (arg);
3348 if (arg == loop->simduid
3349 /* For now. */
3350 && tree_int_cst_equal
3351 (TYPE_SIZE_UNIT (reft),
3352 step))
3354 DR_OFFSET (newdr) = ssize_int (0);
3355 DR_STEP (newdr) = step;
3356 DR_OFFSET_ALIGNMENT (newdr)
3357 = BIGGEST_ALIGNMENT;
3358 DR_STEP_ALIGNMENT (newdr)
3359 = highest_pow2_factor (step);
3360 dr = newdr;
3361 simd_lane_access = true;
3367 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3369 dr = newdr;
3370 if (maybe_gather)
3371 gatherscatter = GATHER;
3372 else
3373 gatherscatter = SCATTER;
3376 if (gatherscatter == SG_NONE && !simd_lane_access)
3377 free_data_ref (newdr);
3380 if (gatherscatter == SG_NONE && !simd_lane_access)
3382 if (dump_enabled_p ())
3384 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3385 "not vectorized: data ref analysis "
3386 "failed ");
3387 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3390 if (is_a <bb_vec_info> (vinfo))
3391 break;
3393 return false;
3397 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3399 if (dump_enabled_p ())
3400 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3401 "not vectorized: base addr of dr is a "
3402 "constant\n");
3404 if (is_a <bb_vec_info> (vinfo))
3405 break;
3407 if (gatherscatter != SG_NONE || simd_lane_access)
3408 free_data_ref (dr);
3409 return false;
3412 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3414 if (dump_enabled_p ())
3416 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3417 "not vectorized: volatile type ");
3418 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3421 if (is_a <bb_vec_info> (vinfo))
3422 break;
3424 return false;
3427 if (stmt_can_throw_internal (stmt))
3429 if (dump_enabled_p ())
3431 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3432 "not vectorized: statement can throw an "
3433 "exception ");
3434 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3437 if (is_a <bb_vec_info> (vinfo))
3438 break;
3440 if (gatherscatter != SG_NONE || simd_lane_access)
3441 free_data_ref (dr);
3442 return false;
3445 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3446 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3448 if (dump_enabled_p ())
3450 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3451 "not vectorized: statement is bitfield "
3452 "access ");
3453 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3456 if (is_a <bb_vec_info> (vinfo))
3457 break;
3459 if (gatherscatter != SG_NONE || simd_lane_access)
3460 free_data_ref (dr);
3461 return false;
3464 base = unshare_expr (DR_BASE_ADDRESS (dr));
3465 offset = unshare_expr (DR_OFFSET (dr));
3466 init = unshare_expr (DR_INIT (dr));
3468 if (is_gimple_call (stmt)
3469 && (!gimple_call_internal_p (stmt)
3470 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3471 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3473 if (dump_enabled_p ())
3475 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3476 "not vectorized: dr in a call ");
3477 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3480 if (is_a <bb_vec_info> (vinfo))
3481 break;
3483 if (gatherscatter != SG_NONE || simd_lane_access)
3484 free_data_ref (dr);
3485 return false;
3488 /* Update DR field in stmt_vec_info struct. */
3490 /* If the dataref is in an inner-loop of the loop that is considered
3491 for vectorization, we also want to analyze the access relative to
3492 the outer-loop (DR contains information only relative to the
3493 inner-most enclosing loop). We do that by building a reference to the
3494 first location accessed by the inner-loop, and analyze it relative to
3495 the outer-loop. */
3496 if (loop && nested_in_vect_loop_p (loop, stmt))
3498 /* Build a reference to the first location accessed by the
3499 inner loop: *(BASE + INIT + OFFSET). By construction,
3500 this address must be invariant in the inner loop, so we
3501 can consider it as being used in the outer loop. */
3502 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
3503 init, offset);
3504 tree init_addr = fold_build_pointer_plus (base, init_offset);
3505 tree init_ref = build_fold_indirect_ref (init_addr);
3507 if (dump_enabled_p ())
3509 dump_printf_loc (MSG_NOTE, vect_location,
3510 "analyze in outer loop: ");
3511 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
3512 dump_printf (MSG_NOTE, "\n");
3515 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
3516 init_ref, loop))
3517 /* dr_analyze_innermost already explained the failure. */
3518 return false;
3520 if (dump_enabled_p ())
3522 dump_printf_loc (MSG_NOTE, vect_location,
3523 "\touter base_address: ");
3524 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3525 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3526 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3527 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3528 STMT_VINFO_DR_OFFSET (stmt_info));
3529 dump_printf (MSG_NOTE,
3530 "\n\touter constant offset from base address: ");
3531 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3532 STMT_VINFO_DR_INIT (stmt_info));
3533 dump_printf (MSG_NOTE, "\n\touter step: ");
3534 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3535 STMT_VINFO_DR_STEP (stmt_info));
3536 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
3537 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
3538 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
3539 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
3540 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
3541 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
3542 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
3543 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
3547 if (STMT_VINFO_DATA_REF (stmt_info))
3549 if (dump_enabled_p ())
3551 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3552 "not vectorized: more than one data ref "
3553 "in stmt: ");
3554 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3557 if (is_a <bb_vec_info> (vinfo))
3558 break;
3560 if (gatherscatter != SG_NONE || simd_lane_access)
3561 free_data_ref (dr);
3562 return false;
3565 STMT_VINFO_DATA_REF (stmt_info) = dr;
3566 if (simd_lane_access)
3568 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3569 free_data_ref (datarefs[i]);
3570 datarefs[i] = dr;
3573 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
3574 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
3575 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
3577 if (dump_enabled_p ())
3579 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3580 "not vectorized: base object not addressable "
3581 "for stmt: ");
3582 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3584 if (is_a <bb_vec_info> (vinfo))
3586 /* In BB vectorization the ref can still participate
3587 in dependence analysis, we just can't vectorize it. */
3588 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3589 continue;
3591 return false;
3594 /* Set vectype for STMT. */
3595 scalar_type = TREE_TYPE (DR_REF (dr));
3596 STMT_VINFO_VECTYPE (stmt_info)
3597 = get_vectype_for_scalar_type (scalar_type);
3598 if (!STMT_VINFO_VECTYPE (stmt_info))
3600 if (dump_enabled_p ())
3602 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3603 "not vectorized: no vectype for stmt: ");
3604 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3605 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3606 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3607 scalar_type);
3608 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3611 if (is_a <bb_vec_info> (vinfo))
3613 /* No vector type is fine, the ref can still participate
3614 in dependence analysis, we just can't vectorize it. */
3615 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3616 continue;
3619 if (gatherscatter != SG_NONE || simd_lane_access)
3621 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3622 if (gatherscatter != SG_NONE)
3623 free_data_ref (dr);
3625 return false;
3627 else
3629 if (dump_enabled_p ())
3631 dump_printf_loc (MSG_NOTE, vect_location,
3632 "got vectype for stmt: ");
3633 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3634 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3635 STMT_VINFO_VECTYPE (stmt_info));
3636 dump_printf (MSG_NOTE, "\n");
3640 /* Adjust the minimal vectorization factor according to the
3641 vector type. */
3642 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3643 if (vf > *min_vf)
3644 *min_vf = vf;
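/* For example, on a target with 128-bit vectors a char access gets a
   V16QI vectype, so the minimum vectorization factor is raised to 16
   (illustrative figures; the actual vectype depends on the target).  */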
3646 if (gatherscatter != SG_NONE)
3648 gather_scatter_info gs_info;
3649 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3650 &gs_info)
3651 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3653 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3654 free_data_ref (dr);
3655 if (dump_enabled_p ())
3657 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3658 (gatherscatter == GATHER) ?
3659 "not vectorized: not suitable for gather "
3660 "load " :
3661 "not vectorized: not suitable for scatter "
3662 "store ");
3663 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3665 return false;
3668 free_data_ref (datarefs[i]);
3669 datarefs[i] = dr;
3670 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3673 else if (is_a <loop_vec_info> (vinfo)
3674 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3676 if (nested_in_vect_loop_p (loop, stmt))
3678 if (dump_enabled_p ())
3680 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3681 "not vectorized: not suitable for strided "
3682 "load ");
3683 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3685 return false;
3687 STMT_VINFO_STRIDED_P (stmt_info) = true;
3691 /* If we stopped analysis at the first dataref we could not analyze
3692 when trying to vectorize a basic-block, mark the rest of the datarefs
3693 as not vectorizable and truncate the vector of datarefs. That
3694 avoids spending useless time in analyzing their dependence. */
3695 if (i != datarefs.length ())
3697 gcc_assert (is_a <bb_vec_info> (vinfo));
3698 for (unsigned j = i; j < datarefs.length (); ++j)
3700 data_reference_p dr = datarefs[j];
3701 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3702 free_data_ref (dr);
3704 datarefs.truncate (i);
3707 return true;
3711 /* Function vect_get_new_vect_var.
3713 Returns a name for a new variable.  The current naming scheme prefixes
3714 vectorizer-generated variables with "vect_", "stmp_", "mask_" or "vectp_"
3715 (depending on the value of VAR_KIND), appending NAME to the prefix
3716 if NAME is provided. */
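/* For instance, with VAR_KIND == vect_pointer_var and NAME == "in", the
   variable created below is based on the string "vectp_in" (an
   illustrative name; later passes may still add their own numbering).  */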
3718 tree
3719 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3721 const char *prefix;
3722 tree new_vect_var;
3724 switch (var_kind)
3726 case vect_simple_var:
3727 prefix = "vect";
3728 break;
3729 case vect_scalar_var:
3730 prefix = "stmp";
3731 break;
3732 case vect_mask_var:
3733 prefix = "mask";
3734 break;
3735 case vect_pointer_var:
3736 prefix = "vectp";
3737 break;
3738 default:
3739 gcc_unreachable ();
3742 if (name)
3744 char* tmp = concat (prefix, "_", name, NULL);
3745 new_vect_var = create_tmp_reg (type, tmp);
3746 free (tmp);
3748 else
3749 new_vect_var = create_tmp_reg (type, prefix);
3751 return new_vect_var;
3754 /* Like vect_get_new_vect_var but return an SSA name. */
3756 tree
3757 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3759 const char *prefix;
3760 tree new_vect_var;
3762 switch (var_kind)
3764 case vect_simple_var:
3765 prefix = "vect";
3766 break;
3767 case vect_scalar_var:
3768 prefix = "stmp";
3769 break;
3770 case vect_pointer_var:
3771 prefix = "vectp";
3772 break;
3773 default:
3774 gcc_unreachable ();
3777 if (name)
3779 char* tmp = concat (prefix, "_", name, NULL);
3780 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
3781 free (tmp);
3783 else
3784 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
3786 return new_vect_var;
3789 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
3791 static void
3792 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3793 stmt_vec_info stmt_info)
3795 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3796 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3797 int misalign = DR_MISALIGNMENT (dr);
3798 if (misalign == DR_MISALIGNMENT_UNKNOWN)
3799 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3800 else
3801 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
3804 /* Function vect_create_addr_base_for_vector_ref.
3806 Create an expression that computes the address of the first memory location
3807 that will be accessed for a data reference.
3809 Input:
3810 STMT: The statement containing the data reference.
3811 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3812 OFFSET: Optional. If supplied, it is added to the initial address.
3813 LOOP: Specify relative to which loop-nest the address should be computed.
3814 For example, when the dataref is in an inner-loop nested in an
3815 outer-loop that is now being vectorized, LOOP can be either the
3816 outer-loop, or the inner-loop. The first memory location accessed
3817 by the following dataref ('in' points to short):
3819 for (i=0; i<N; i++)
3820 for (j=0; j<M; j++)
3821 s += in[i+j]
3823 is as follows:
3824 if LOOP=i_loop: &in (relative to i_loop)
3825 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3826 BYTE_OFFSET: Optional, defaults to NULL. If supplied, it is added to the
3827 initial address. Unlike OFFSET, which is the number of elements to
3828 be added, BYTE_OFFSET is measured in bytes.
3830 Output:
3831 1. Return an SSA_NAME whose value is the address of the memory location of
3832 the first vector of the data reference.
3833 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3834 these statement(s) which define the returned SSA_NAME.
3836 FORNOW: We are only handling array accesses with step 1. */
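/* As an illustrative summary of the computation below (loop case):

     addr_base = base_address + (offset + init)
                 [+ OFFSET * sizeof (element)] [+ BYTE_OFFSET]

   so, e.g., an OFFSET of 4 over short elements contributes 8 bytes on
   top of the base address.  */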
3838 tree
3839 vect_create_addr_base_for_vector_ref (gimple *stmt,
3840 gimple_seq *new_stmt_list,
3841 tree offset,
3842 tree byte_offset)
3844 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3845 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3846 const char *base_name;
3847 tree addr_base;
3848 tree dest;
3849 gimple_seq seq = NULL;
3850 tree vect_ptr_type;
3851 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3852 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3853 innermost_loop_behavior *drb = vect_dr_behavior (dr);
3855 tree data_ref_base = unshare_expr (drb->base_address);
3856 tree base_offset = unshare_expr (drb->offset);
3857 tree init = unshare_expr (drb->init);
3859 if (loop_vinfo)
3860 base_name = get_name (data_ref_base);
3861 else
3863 base_offset = ssize_int (0);
3864 init = ssize_int (0);
3865 base_name = get_name (DR_REF (dr));
3868 /* Create base_offset */
3869 base_offset = size_binop (PLUS_EXPR,
3870 fold_convert (sizetype, base_offset),
3871 fold_convert (sizetype, init));
3873 if (offset)
3875 offset = fold_build2 (MULT_EXPR, sizetype,
3876 fold_convert (sizetype, offset), step);
3877 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3878 base_offset, offset);
3880 if (byte_offset)
3882 byte_offset = fold_convert (sizetype, byte_offset);
3883 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3884 base_offset, byte_offset);
3887 /* base + base_offset */
3888 if (loop_vinfo)
3889 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3890 else
3892 addr_base = build1 (ADDR_EXPR,
3893 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3894 unshare_expr (DR_REF (dr)));
3897 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3898 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3899 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
3900 gimple_seq_add_seq (new_stmt_list, seq);
3902 if (DR_PTR_INFO (dr)
3903 && TREE_CODE (addr_base) == SSA_NAME
3904 && !SSA_NAME_PTR_INFO (addr_base))
3906 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
3907 if (offset || byte_offset)
3908 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3911 if (dump_enabled_p ())
3913 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3914 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3915 dump_printf (MSG_NOTE, "\n");
3918 return addr_base;
3922 /* Function vect_create_data_ref_ptr.
3924 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3925 location accessed in the loop by STMT, along with the def-use update
3926 chain to appropriately advance the pointer through the loop iterations.
3927 Also set aliasing information for the pointer. This pointer is used by
3928 the callers to this function to create a memory reference expression for
3929 vector load/store access.
3931 Input:
3932 1. STMT: a stmt that references memory. Expected to be of the form
3933 GIMPLE_ASSIGN <name, data-ref> or
3934 GIMPLE_ASSIGN <data-ref, name>.
3935 2. AGGR_TYPE: the type of the reference, which should be either a vector
3936 or an array.
3937 3. AT_LOOP: the loop where the vector memref is to be created.
3938 4. OFFSET (optional): an offset to be added to the initial address accessed
3939 by the data-ref in STMT.
3940 5. BSI: location where the new stmts are to be placed if there is no loop
3941 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
3942 pointing to the initial address.
3943 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
3944 to the initial address accessed by the data-ref in STMT. This is
3945 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
3946 in bytes.
3948 Output:
3949 1. Declare a new ptr to vector_type, and have it point to the base of the
3950 data reference (initial address accessed by the data reference).
3951 For example, for vector of type V8HI, the following code is generated:
3953 v8hi *ap;
3954 ap = (v8hi *)initial_address;
3956 if OFFSET is not supplied:
3957 initial_address = &a[init];
3958 if OFFSET is supplied:
3959 initial_address = &a[init + OFFSET];
3960 if BYTE_OFFSET is supplied:
3961 initial_address = &a[init] + BYTE_OFFSET;
3963 Return the initial_address in INITIAL_ADDRESS.
3965 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
3966 update the pointer in each iteration of the loop.
3968 Return the increment stmt that updates the pointer in PTR_INCR.
3970 3. Set INV_P to true if the access pattern of the data reference in the
3971 vectorized loop is invariant. Set it to false otherwise.
3973 4. Return the pointer. */
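/* A typical call, mirroring the one in vect_setup_realignment further
   down (shown here only for illustration):

     ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
                                     NULL_TREE, &init_addr, NULL, &inc,
                                     true, &inv_p);  */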
3975 tree
3976 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
3977 tree offset, tree *initial_address,
3978 gimple_stmt_iterator *gsi, gimple **ptr_incr,
3979 bool only_init, bool *inv_p, tree byte_offset)
3981 const char *base_name;
3982 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3983 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3984 struct loop *loop = NULL;
3985 bool nested_in_vect_loop = false;
3986 struct loop *containing_loop = NULL;
3987 tree aggr_ptr_type;
3988 tree aggr_ptr;
3989 tree new_temp;
3990 gimple_seq new_stmt_list = NULL;
3991 edge pe = NULL;
3992 basic_block new_bb;
3993 tree aggr_ptr_init;
3994 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3995 tree aptr;
3996 gimple_stmt_iterator incr_gsi;
3997 bool insert_after;
3998 tree indx_before_incr, indx_after_incr;
3999 gimple *incr;
4000 tree step;
4001 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4003 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4004 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4006 if (loop_vinfo)
4008 loop = LOOP_VINFO_LOOP (loop_vinfo);
4009 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4010 containing_loop = (gimple_bb (stmt))->loop_father;
4011 pe = loop_preheader_edge (loop);
4013 else
4015 gcc_assert (bb_vinfo);
4016 only_init = true;
4017 *ptr_incr = NULL;
4020 /* Check the step (evolution) of the load in LOOP, and record
4021 whether it's invariant. */
4022 step = vect_dr_behavior (dr)->step;
4023 if (integer_zerop (step))
4024 *inv_p = true;
4025 else
4026 *inv_p = false;
4028 /* Create an expression for the first address accessed by this load
4029 in LOOP. */
4030 base_name = get_name (DR_BASE_ADDRESS (dr));
4032 if (dump_enabled_p ())
4034 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4035 dump_printf_loc (MSG_NOTE, vect_location,
4036 "create %s-pointer variable to type: ",
4037 get_tree_code_name (TREE_CODE (aggr_type)));
4038 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4039 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4040 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4041 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4042 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4043 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4044 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4045 else
4046 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4047 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4048 dump_printf (MSG_NOTE, "\n");
4051 /* (1) Create the new aggregate-pointer variable.
4052 Vector and array types inherit the alias set of their component
4053 type by default so we need to use a ref-all pointer if the data
4054 reference does not conflict with the created aggregated data
4055 reference because it is not addressable. */
4056 bool need_ref_all = false;
4057 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4058 get_alias_set (DR_REF (dr))))
4059 need_ref_all = true;
4060 /* Likewise for any of the data references in the stmt group. */
4061 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4063 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4066 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4067 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4068 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4069 get_alias_set (DR_REF (sdr))))
4071 need_ref_all = true;
4072 break;
4074 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4076 while (orig_stmt);
4078 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4079 need_ref_all);
4080 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4083 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4084 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4085 def-use update cycles for the pointer: one relative to the outer-loop
4086 (LOOP), which is what steps (3) and (4) below do. The other is relative
4087 to the inner-loop (which is the inner-most loop containing the dataref),
4088 and this is done by step (5) below.
4090 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4091 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4092 redundant. Steps (3),(4) create the following:
4094 vp0 = &base_addr;
4095 LOOP: vp1 = phi(vp0,vp2)
4098 vp2 = vp1 + step
4099 goto LOOP
4101 If there is an inner-loop nested in loop, then step (5) will also be
4102 applied, and an additional update in the inner-loop will be created:
4104 vp0 = &base_addr;
4105 LOOP: vp1 = phi(vp0,vp2)
4107 inner: vp3 = phi(vp1,vp4)
4108 vp4 = vp3 + inner_step
4109 if () goto inner
4111 vp2 = vp1 + step
4112 if () goto LOOP */
4114 /* (2) Calculate the initial address of the aggregate-pointer, and set
4115 the aggregate-pointer to point to it before the loop. */
4117 /* Create: (&(base[init_val+offset]) + byte_offset) in the loop preheader. */
4119 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4120 offset, byte_offset);
4121 if (new_stmt_list)
4123 if (pe)
4125 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4126 gcc_assert (!new_bb);
4128 else
4129 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4132 *initial_address = new_temp;
4133 aggr_ptr_init = new_temp;
4135 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4136 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4137 inner-loop nested in LOOP (during outer-loop vectorization). */
4139 /* No update in loop is required. */
4140 if (only_init && (!loop_vinfo || at_loop == loop))
4141 aptr = aggr_ptr_init;
4142 else
4144 /* The step of the aggregate pointer is the type size. */
4145 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4146 /* One exception to the above is when the scalar step of the load in
4147 LOOP is zero. In this case the step here is also zero. */
4148 if (*inv_p)
4149 iv_step = size_zero_node;
4150 else if (tree_int_cst_sgn (step) == -1)
4151 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
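/* E.g., for a V8HI aggregate IV_STEP is 16 bytes, negated just above
   when the scalar access walks backwards through memory.  */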
4153 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4155 create_iv (aggr_ptr_init,
4156 fold_convert (aggr_ptr_type, iv_step),
4157 aggr_ptr, loop, &incr_gsi, insert_after,
4158 &indx_before_incr, &indx_after_incr);
4159 incr = gsi_stmt (incr_gsi);
4160 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4162 /* Copy the points-to information if it exists. */
4163 if (DR_PTR_INFO (dr))
4165 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4166 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4168 if (ptr_incr)
4169 *ptr_incr = incr;
4171 aptr = indx_before_incr;
4174 if (!nested_in_vect_loop || only_init)
4175 return aptr;
4178 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4179 nested in LOOP, if it exists. */
4181 gcc_assert (nested_in_vect_loop);
4182 if (!only_init)
4184 standard_iv_increment_position (containing_loop, &incr_gsi,
4185 &insert_after);
4186 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4187 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4188 &indx_after_incr);
4189 incr = gsi_stmt (incr_gsi);
4190 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4192 /* Copy the points-to information if it exists. */
4193 if (DR_PTR_INFO (dr))
4195 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4196 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4198 if (ptr_incr)
4199 *ptr_incr = incr;
4201 return indx_before_incr;
4203 else
4204 gcc_unreachable ();
4208 /* Function bump_vector_ptr
4210 Increment a pointer (to a vector type) by vector-size. If requested,
4211 i.e. if PTR-INCR is given, then also connect the new increment stmt
4212 to the existing def-use update-chain of the pointer, by modifying
4213 the PTR_INCR as illustrated below:
4215 The pointer def-use update-chain before this function:
4216 DATAREF_PTR = phi (p_0, p_2)
4217 ....
4218 PTR_INCR: p_2 = DATAREF_PTR + step
4220 The pointer def-use update-chain after this function:
4221 DATAREF_PTR = phi (p_0, p_2)
4222 ....
4223 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4224 ....
4225 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4227 Input:
4228 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4229 in the loop.
4230 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4231 the loop. The increment amount across iterations is expected
4232 to be vector_size.
4233 BSI - location where the new update stmt is to be placed.
4234 STMT - the original scalar memory-access stmt that is being vectorized.
4235 BUMP - optional. The offset by which to bump the pointer. If not given,
4236 the offset is assumed to be vector_size.
4238 Output: Return NEW_DATAREF_PTR as illustrated above.
4242 tree
4243 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4244 gimple *stmt, tree bump)
4246 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4247 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4248 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4249 tree update = TYPE_SIZE_UNIT (vectype);
4250 gassign *incr_stmt;
4251 ssa_op_iter iter;
4252 use_operand_p use_p;
4253 tree new_dataref_ptr;
4255 if (bump)
4256 update = bump;
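/* At this point UPDATE is either the explicit BUMP or the vector size
   in bytes, e.g. 16 for a V4SI data reference.  */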
4258 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4259 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4260 else
4261 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4262 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4263 dataref_ptr, update);
4264 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4266 /* Copy the points-to information if it exists. */
4267 if (DR_PTR_INFO (dr))
4269 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4270 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4273 if (!ptr_incr)
4274 return new_dataref_ptr;
4276 /* Update the vector-pointer's cross-iteration increment. */
4277 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4279 tree use = USE_FROM_PTR (use_p);
4281 if (use == dataref_ptr)
4282 SET_USE (use_p, new_dataref_ptr);
4283 else
4284 gcc_assert (tree_int_cst_compare (use, update) == 0);
4287 return new_dataref_ptr;
4291 /* Function vect_create_destination_var.
4293 Create a new temporary of type VECTYPE. */
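/* For example, if SCALAR_DEST is the SSA name x_5 (base name "x",
   version 5) and VECTYPE is a non-boolean vector type, the temporary
   created below is based on the name "vect_x_5" (illustrative only).  */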
4295 tree
4296 vect_create_destination_var (tree scalar_dest, tree vectype)
4298 tree vec_dest;
4299 const char *name;
4300 char *new_name;
4301 tree type;
4302 enum vect_var_kind kind;
4304 kind = vectype
4305 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4306 ? vect_mask_var
4307 : vect_simple_var
4308 : vect_scalar_var;
4309 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4311 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4313 name = get_name (scalar_dest);
4314 if (name)
4315 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4316 else
4317 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4318 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4319 free (new_name);
4321 return vec_dest;
4324 /* Function vect_grouped_store_supported.
4326 Returns TRUE if interleave high and interleave low permutations
4327 are supported, and FALSE otherwise. */
4329 bool
4330 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4332 machine_mode mode = TYPE_MODE (vectype);
4334 /* vect_permute_store_chain requires the group size to be equal to 3 or
4335 be a power of two. */
4336 if (count != 3 && exact_log2 (count) == -1)
4338 if (dump_enabled_p ())
4339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4340 "the size of the group of accesses"
4341 " is not a power of 2 or not eqaul to 3\n");
4342 return false;
4345 /* Check that the permutation is supported. */
4346 if (VECTOR_MODE_P (mode))
4348 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4349 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4351 if (count == 3)
4353 unsigned int j0 = 0, j1 = 0, j2 = 0;
4354 unsigned int i, j;
4356 for (j = 0; j < 3; j++)
4358 int nelt0 = ((3 - j) * nelt) % 3;
4359 int nelt1 = ((3 - j) * nelt + 1) % 3;
4360 int nelt2 = ((3 - j) * nelt + 2) % 3;
4361 for (i = 0; i < nelt; i++)
4363 if (3 * i + nelt0 < nelt)
4364 sel[3 * i + nelt0] = j0++;
4365 if (3 * i + nelt1 < nelt)
4366 sel[3 * i + nelt1] = nelt + j1++;
4367 if (3 * i + nelt2 < nelt)
4368 sel[3 * i + nelt2] = 0;
4370 if (!can_vec_perm_p (mode, false, sel))
4372 if (dump_enabled_p ())
4373 dump_printf (MSG_MISSED_OPTIMIZATION,
4374 "permutaion op not supported by target.\n");
4375 return false;
4378 for (i = 0; i < nelt; i++)
4380 if (3 * i + nelt0 < nelt)
4381 sel[3 * i + nelt0] = 3 * i + nelt0;
4382 if (3 * i + nelt1 < nelt)
4383 sel[3 * i + nelt1] = 3 * i + nelt1;
4384 if (3 * i + nelt2 < nelt)
4385 sel[3 * i + nelt2] = nelt + j2++;
4387 if (!can_vec_perm_p (mode, false, sel))
4389 if (dump_enabled_p ())
4390 dump_printf (MSG_MISSED_OPTIMIZATION,
4391 "permutaion op not supported by target.\n");
4392 return false;
4395 return true;
4397 else
4399 /* If length is not equal to 3 then only power of 2 is supported. */
4400 gcc_assert (pow2p_hwi (count));
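/* E.g., for nelt == 8 the selector built by the first loop below is
   {0, 8, 1, 9, 2, 10, 3, 11}, interleaving the first halves of the two
   vectors; adding nelt / 2 to each element then gives
   {4, 12, 5, 13, 6, 14, 7, 15} for the second halves.  */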
4402 for (i = 0; i < nelt / 2; i++)
4404 sel[i * 2] = i;
4405 sel[i * 2 + 1] = i + nelt;
4407 if (can_vec_perm_p (mode, false, sel))
4409 for (i = 0; i < nelt; i++)
4410 sel[i] += nelt / 2;
4411 if (can_vec_perm_p (mode, false, sel))
4412 return true;
4417 if (dump_enabled_p ())
4418 dump_printf (MSG_MISSED_OPTIMIZATION,
4419 "permutaion op not supported by target.\n");
4420 return false;
4424 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4425 type VECTYPE. */
4427 bool
4428 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4430 return vect_lanes_optab_supported_p ("vec_store_lanes",
4431 vec_store_lanes_optab,
4432 vectype, count);
4436 /* Function vect_permute_store_chain.
4438 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4439 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4440 the data correctly for the stores. Return the final references for stores
4441 in RESULT_CHAIN.
4443 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4444 The input is 4 vectors each containing 8 elements. We assign a number to
4445 each element, the input sequence is:
4447 1st vec: 0 1 2 3 4 5 6 7
4448 2nd vec: 8 9 10 11 12 13 14 15
4449 3rd vec: 16 17 18 19 20 21 22 23
4450 4th vec: 24 25 26 27 28 29 30 31
4452 The output sequence should be:
4454 1st vec: 0 8 16 24 1 9 17 25
4455 2nd vec: 2 10 18 26 3 11 19 27
4456 3rd vec: 4 12 20 28 5 13 21 29
4457 4th vec: 6 14 22 30 7 15 23 31
4459 i.e., we interleave the contents of the four vectors in their order.
4461 We use interleave_high/low instructions to create such output. The input of
4462 each interleave_high/low operation is two vectors:
4463 1st vec 2nd vec
4464 0 1 2 3 4 5 6 7
4465 the even elements of the result vector are obtained left-to-right from the
4466 high/low elements of the first vector. The odd elements of the result are
4467 obtained left-to-right from the high/low elements of the second vector.
4468 The output of interleave_high will be: 0 4 1 5
4469 and of interleave_low: 2 6 3 7
4472 The permutation is done in log LENGTH stages. In each stage interleave_high
4473 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4474 where the first argument is taken from the first half of DR_CHAIN and the
4475 second argument from its second half.
4476 In our example,
4478 I1: interleave_high (1st vec, 3rd vec)
4479 I2: interleave_low (1st vec, 3rd vec)
4480 I3: interleave_high (2nd vec, 4th vec)
4481 I4: interleave_low (2nd vec, 4th vec)
4483 The output for the first stage is:
4485 I1: 0 16 1 17 2 18 3 19
4486 I2: 4 20 5 21 6 22 7 23
4487 I3: 8 24 9 25 10 26 11 27
4488 I4: 12 28 13 29 14 30 15 31
4490 The output of the second stage, i.e. the final result is:
4492 I1: 0 8 16 24 1 9 17 25
4493 I2: 2 10 18 26 3 11 19 27
4494 I3: 4 12 20 28 5 13 21 29
4495 I4: 6 14 22 30 7 15 23 31. */
4497 void
4498 vect_permute_store_chain (vec<tree> dr_chain,
4499 unsigned int length,
4500 gimple *stmt,
4501 gimple_stmt_iterator *gsi,
4502 vec<tree> *result_chain)
4504 tree vect1, vect2, high, low;
4505 gimple *perm_stmt;
4506 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4507 tree perm_mask_low, perm_mask_high;
4508 tree data_ref;
4509 tree perm3_mask_low, perm3_mask_high;
4510 unsigned int i, n, log_length = exact_log2 (length);
4511 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4512 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4514 result_chain->quick_grow (length);
4515 memcpy (result_chain->address (), dr_chain.address (),
4516 length * sizeof (tree));
4518 if (length == 3)
4520 unsigned int j0 = 0, j1 = 0, j2 = 0;
4522 for (j = 0; j < 3; j++)
4524 int nelt0 = ((3 - j) * nelt) % 3;
4525 int nelt1 = ((3 - j) * nelt + 1) % 3;
4526 int nelt2 = ((3 - j) * nelt + 2) % 3;
4528 for (i = 0; i < nelt; i++)
4530 if (3 * i + nelt0 < nelt)
4531 sel[3 * i + nelt0] = j0++;
4532 if (3 * i + nelt1 < nelt)
4533 sel[3 * i + nelt1] = nelt + j1++;
4534 if (3 * i + nelt2 < nelt)
4535 sel[3 * i + nelt2] = 0;
4537 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4539 for (i = 0; i < nelt; i++)
4541 if (3 * i + nelt0 < nelt)
4542 sel[3 * i + nelt0] = 3 * i + nelt0;
4543 if (3 * i + nelt1 < nelt)
4544 sel[3 * i + nelt1] = 3 * i + nelt1;
4545 if (3 * i + nelt2 < nelt)
4546 sel[3 * i + nelt2] = nelt + j2++;
4548 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4550 vect1 = dr_chain[0];
4551 vect2 = dr_chain[1];
4553 /* Create interleaving stmt:
4554 low = VEC_PERM_EXPR <vect1, vect2,
4555 {j, nelt, *, j + 1, nelt + j + 1, *,
4556 j + 2, nelt + j + 2, *, ...}> */
4557 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4558 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4559 vect2, perm3_mask_low);
4560 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4562 vect1 = data_ref;
4563 vect2 = dr_chain[2];
4564 /* Create interleaving stmt:
4565 low = VEC_PERM_EXPR <vect1, vect2,
4566 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4567 6, 7, nelt + j + 2, ...}> */
4568 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4569 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4570 vect2, perm3_mask_high);
4571 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4572 (*result_chain)[j] = data_ref;
4575 else
4577 /* If length is not equal to 3 then only power of 2 is supported. */
4578 gcc_assert (pow2p_hwi (length));
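/* E.g., for nelt == 8 PERM_MASK_HIGH below is {0, 8, 1, 9, 2, 10, 3, 11}
   and PERM_MASK_LOW is {4, 12, 5, 13, 6, 14, 7, 15}, i.e. the masks
   interleave the first halves and the second halves of a vector pair.  */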
4580 for (i = 0, n = nelt / 2; i < n; i++)
4582 sel[i * 2] = i;
4583 sel[i * 2 + 1] = i + nelt;
4585 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4587 for (i = 0; i < nelt; i++)
4588 sel[i] += nelt / 2;
4589 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4591 for (i = 0, n = log_length; i < n; i++)
4593 for (j = 0; j < length/2; j++)
4595 vect1 = dr_chain[j];
4596 vect2 = dr_chain[j+length/2];
4598 /* Create interleaving stmt:
4599 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4600 ...}> */
4601 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4602 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4603 vect2, perm_mask_high);
4604 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4605 (*result_chain)[2*j] = high;
4607 /* Create interleaving stmt:
4608 low = VEC_PERM_EXPR <vect1, vect2,
4609 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4610 ...}> */
4611 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4612 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4613 vect2, perm_mask_low);
4614 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4615 (*result_chain)[2*j+1] = low;
4617 memcpy (dr_chain.address (), result_chain->address (),
4618 length * sizeof (tree));
4623 /* Function vect_setup_realignment
4625 This function is called when vectorizing an unaligned load using
4626 the dr_explicit_realign[_optimized] scheme.
4627 This function generates the following code at the loop prolog:
4629 p = initial_addr;
4630 x msq_init = *(floor(p)); # prolog load
4631 realignment_token = call target_builtin;
4632 loop:
4633 x msq = phi (msq_init, ---)
4635 The stmts marked with x are generated only for the case of
4636 dr_explicit_realign_optimized.
4638 The code above sets up a new (vector) pointer, pointing to the first
4639 location accessed by STMT, and a "floor-aligned" load using that pointer.
4640 It also generates code to compute the "realignment-token" (if the relevant
4641 target hook was defined), and creates a phi-node at the loop-header bb
4642 whose arguments are the result of the prolog-load (created by this
4643 function) and the result of a load that takes place in the loop (to be
4644 created by the caller to this function).
4646 For the case of dr_explicit_realign_optimized:
4647 The caller to this function uses the phi-result (msq) to create the
4648 realignment code inside the loop, and sets up the missing phi argument,
4649 as follows:
4650 loop:
4651 msq = phi (msq_init, lsq)
4652 lsq = *(floor(p')); # load in loop
4653 result = realign_load (msq, lsq, realignment_token);
4655 For the case of dr_explicit_realign:
4656 loop:
4657 msq = *(floor(p)); # load in loop
4658 p' = p + (VS-1);
4659 lsq = *(floor(p')); # load in loop
4660 result = realign_load (msq, lsq, realignment_token);
4662 Input:
4663 STMT - (scalar) load stmt to be vectorized. This load accesses
4664 a memory location that may be unaligned.
4665 BSI - place where new code is to be inserted.
4666 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4667 is used.
4669 Output:
4670 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4671 target hook, if defined.
4672 Return value - the result of the loop-header phi node. */
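/* As a concrete illustration (assuming 16-byte vectors and a pointer p
   that is only element aligned): floor (p) is computed as p & -16, so
   msq = *(p & -16) and lsq = *((p + 15) & -16) are two aligned loads
   that together cover the 16 bytes starting at p, and the realign_load
   combines them using the realignment token.  */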
4674 tree
4675 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4676 tree *realignment_token,
4677 enum dr_alignment_support alignment_support_scheme,
4678 tree init_addr,
4679 struct loop **at_loop)
4681 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4682 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4683 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4684 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4685 struct loop *loop = NULL;
4686 edge pe = NULL;
4687 tree scalar_dest = gimple_assign_lhs (stmt);
4688 tree vec_dest;
4689 gimple *inc;
4690 tree ptr;
4691 tree data_ref;
4692 basic_block new_bb;
4693 tree msq_init = NULL_TREE;
4694 tree new_temp;
4695 gphi *phi_stmt;
4696 tree msq = NULL_TREE;
4697 gimple_seq stmts = NULL;
4698 bool inv_p;
4699 bool compute_in_loop = false;
4700 bool nested_in_vect_loop = false;
4701 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4702 struct loop *loop_for_initial_load = NULL;
4704 if (loop_vinfo)
4706 loop = LOOP_VINFO_LOOP (loop_vinfo);
4707 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4710 gcc_assert (alignment_support_scheme == dr_explicit_realign
4711 || alignment_support_scheme == dr_explicit_realign_optimized);
4713 /* We need to generate three things:
4714 1. the misalignment computation
4715 2. the extra vector load (for the optimized realignment scheme).
4716 3. the phi node for the two vectors from which the realignment is
4717 done (for the optimized realignment scheme). */
4719 /* 1. Determine where to generate the misalignment computation.
4721 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4722 calculation will be generated by this function, outside the loop (in the
4723 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4724 caller, inside the loop.
4726 Background: If the misalignment remains fixed throughout the iterations of
4727 the loop, then both realignment schemes are applicable, and also the
4728 misalignment computation can be done outside LOOP. This is because we are
4729 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4730 are a multiple of VS (the Vector Size), and therefore the misalignment in
4731 different vectorized LOOP iterations is always the same.
4732 The problem arises only if the memory access is in an inner-loop nested
4733 inside LOOP, which is now being vectorized using outer-loop vectorization.
4734 This is the only case when the misalignment of the memory access may not
4735 remain fixed throughout the iterations of the inner-loop (as explained in
4736 detail in vect_supportable_dr_alignment). In this case, not only is the
4737 optimized realignment scheme not applicable, but also the misalignment
4738 computation (and generation of the realignment token that is passed to
4739 REALIGN_LOAD) have to be done inside the loop.
4741 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4742 or not, which in turn determines if the misalignment is computed inside
4743 the inner-loop, or outside LOOP. */
4745 if (init_addr != NULL_TREE || !loop_vinfo)
4747 compute_in_loop = true;
4748 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4752 /* 2. Determine where to generate the extra vector load.
4754 For the optimized realignment scheme, instead of generating two vector
4755 loads in each iteration, we generate a single extra vector load in the
4756 preheader of the loop, and in each iteration reuse the result of the
4757 vector load from the previous iteration. In case the memory access is in
4758 an inner-loop nested inside LOOP, which is now being vectorized using
4759 outer-loop vectorization, we need to determine whether this initial vector
4760 load should be generated at the preheader of the inner-loop, or can be
4761 generated at the preheader of LOOP. If the memory access has no evolution
4762 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4763 to be generated inside LOOP (in the preheader of the inner-loop). */
4765 if (nested_in_vect_loop)
4767 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4768 bool invariant_in_outerloop =
4769 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4770 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4772 else
4773 loop_for_initial_load = loop;
4774 if (at_loop)
4775 *at_loop = loop_for_initial_load;
4777 if (loop_for_initial_load)
4778 pe = loop_preheader_edge (loop_for_initial_load);
4780 /* 3. For the case of the optimized realignment, create the first vector
4781 load at the loop preheader. */
4783 if (alignment_support_scheme == dr_explicit_realign_optimized)
4785 /* Create msq_init = *(floor(p1)) in the loop preheader */
4786 gassign *new_stmt;
4788 gcc_assert (!compute_in_loop);
4789 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4790 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4791 NULL_TREE, &init_addr, NULL, &inc,
4792 true, &inv_p);
4793 if (TREE_CODE (ptr) == SSA_NAME)
4794 new_temp = copy_ssa_name (ptr);
4795 else
4796 new_temp = make_ssa_name (TREE_TYPE (ptr));
4797 new_stmt = gimple_build_assign
4798 (new_temp, BIT_AND_EXPR, ptr,
4799 build_int_cst (TREE_TYPE (ptr),
4800 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4801 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4802 gcc_assert (!new_bb);
4803 data_ref
4804 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4805 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4806 new_stmt = gimple_build_assign (vec_dest, data_ref);
4807 new_temp = make_ssa_name (vec_dest, new_stmt);
4808 gimple_assign_set_lhs (new_stmt, new_temp);
4809 if (pe)
4811 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4812 gcc_assert (!new_bb);
4814 else
4815 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4817 msq_init = gimple_assign_lhs (new_stmt);
4820 /* 4. Create realignment token using a target builtin, if available.
4821 It is done either inside the containing loop, or before LOOP (as
4822 determined above). */
4824 if (targetm.vectorize.builtin_mask_for_load)
4826 gcall *new_stmt;
4827 tree builtin_decl;
4829 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4830 if (!init_addr)
4832 /* Generate the INIT_ADDR computation outside LOOP. */
4833 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4834 NULL_TREE);
4835 if (loop)
4837 pe = loop_preheader_edge (loop);
4838 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4839 gcc_assert (!new_bb);
4841 else
4842 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4845 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4846 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4847 vec_dest =
4848 vect_create_destination_var (scalar_dest,
4849 gimple_call_return_type (new_stmt));
4850 new_temp = make_ssa_name (vec_dest, new_stmt);
4851 gimple_call_set_lhs (new_stmt, new_temp);
4853 if (compute_in_loop)
4854 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4855 else
4857 /* Generate the misalignment computation outside LOOP. */
4858 pe = loop_preheader_edge (loop);
4859 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4860 gcc_assert (!new_bb);
4863 *realignment_token = gimple_call_lhs (new_stmt);
4865 /* The result of the CALL_EXPR to this builtin is determined from
4866 the value of the parameter and no global variables are touched
4867 which makes the builtin a "const" function. Requiring the
4868 builtin to have the "const" attribute makes it unnecessary
4869 to call mark_call_clobbered. */
4870 gcc_assert (TREE_READONLY (builtin_decl));
4873 if (alignment_support_scheme == dr_explicit_realign)
4874 return msq;
4876 gcc_assert (!compute_in_loop);
4877 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4880 /* 5. Create msq = phi <msq_init, lsq> in loop */
4882 pe = loop_preheader_edge (containing_loop);
4883 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4884 msq = make_ssa_name (vec_dest);
4885 phi_stmt = create_phi_node (msq, containing_loop->header);
4886 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4888 return msq;
4892 /* Function vect_grouped_load_supported.
4894 COUNT is the size of the load group (the number of statements plus the
4895 number of gaps). SINGLE_ELEMENT_P is true if there is actually
4896 only one statement, with a gap of COUNT - 1.
4898 Returns true if a suitable permute exists. */
4900 bool
4901 vect_grouped_load_supported (tree vectype, bool single_element_p,
4902 unsigned HOST_WIDE_INT count)
4904 machine_mode mode = TYPE_MODE (vectype);
4906 /* If this is single-element interleaving with an element distance
4907 that leaves unused vector loads around punt - we at least create
4908 very sub-optimal code in that case (and blow up memory,
4909 see PR65518). */
4910 if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
4912 if (dump_enabled_p ())
4913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4914 "single-element interleaving not supported "
4915 "for not adjacent vector loads\n");
4916 return false;
4919 /* vect_permute_load_chain requires the group size to be equal to 3 or
4920 be a power of two. */
4921 if (count != 3 && exact_log2 (count) == -1)
4923 if (dump_enabled_p ())
4924 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4925 "the size of the group of accesses"
4926 " is not a power of 2 or not equal to 3\n");
4927 return false;
4930 /* Check that the permutation is supported. */
4931 if (VECTOR_MODE_P (mode))
4933 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
4934 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4936 if (count == 3)
4938 unsigned int k;
4939 for (k = 0; k < 3; k++)
4941 for (i = 0; i < nelt; i++)
4942 if (3 * i + k < 2 * nelt)
4943 sel[i] = 3 * i + k;
4944 else
4945 sel[i] = 0;
4946 if (!can_vec_perm_p (mode, false, sel))
4948 if (dump_enabled_p ())
4949 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4950 "shuffle of 3 loads is not supported by"
4951 " target\n");
4952 return false;
4954 for (i = 0, j = 0; i < nelt; i++)
4955 if (3 * i + k < 2 * nelt)
4956 sel[i] = i;
4957 else
4958 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
4959 if (!can_vec_perm_p (mode, false, sel))
4961 if (dump_enabled_p ())
4962 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4963 "shuffle of 3 loads is not supported by"
4964 " target\n");
4965 return false;
4968 return true;
4970 else
4972 /* If length is not equal to 3 then only power of 2 is supported. */
4973 gcc_assert (pow2p_hwi (count));
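/* E.g., for nelt == 8 the two selectors checked below are
   {0, 2, 4, 6, 8, 10, 12, 14} (extract even) and
   {1, 3, 5, 7, 9, 11, 13, 15} (extract odd).  */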
4974 for (i = 0; i < nelt; i++)
4975 sel[i] = i * 2;
4976 if (can_vec_perm_p (mode, false, sel))
4978 for (i = 0; i < nelt; i++)
4979 sel[i] = i * 2 + 1;
4980 if (can_vec_perm_p (mode, false, sel))
4981 return true;
4986 if (dump_enabled_p ())
4987 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4988 "extract even/odd not supported by target\n");
4989 return false;
4992 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
4993 type VECTYPE. */
4995 bool
4996 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4998 return vect_lanes_optab_supported_p ("vec_load_lanes",
4999 vec_load_lanes_optab,
5000 vectype, count);
5003 /* Function vect_permute_load_chain.
5005 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5006 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5007 the input data correctly. Return the final references for loads in
5008 RESULT_CHAIN.
5010 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5011 The input is 4 vectors each containing 8 elements. We assign a number to each
5012 element, the input sequence is:
5014 1st vec: 0 1 2 3 4 5 6 7
5015 2nd vec: 8 9 10 11 12 13 14 15
5016 3rd vec: 16 17 18 19 20 21 22 23
5017 4th vec: 24 25 26 27 28 29 30 31
5019 The output sequence should be:
5021 1st vec: 0 4 8 12 16 20 24 28
5022 2nd vec: 1 5 9 13 17 21 25 29
5023 3rd vec: 2 6 10 14 18 22 26 30
5024 4th vec: 3 7 11 15 19 23 27 31
5026 i.e., the first output vector should contain the first elements of each
5027 interleaving group, etc.
5029 We use extract_even/odd instructions to create such output. The input of
5030 each extract_even/odd operation is two vectors
5031 1st vec 2nd vec
5032 0 1 2 3 4 5 6 7
5034 and the output is the vector of extracted even/odd elements. The output of
5035 extract_even will be: 0 2 4 6
5036 and of extract_odd: 1 3 5 7
5039 The permutation is done in log LENGTH stages. In each stage extract_even
5040 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5041 their order. In our example,
5043 E1: extract_even (1st vec, 2nd vec)
5044 E2: extract_odd (1st vec, 2nd vec)
5045 E3: extract_even (3rd vec, 4th vec)
5046 E4: extract_odd (3rd vec, 4th vec)
5048 The output for the first stage will be:
5050 E1: 0 2 4 6 8 10 12 14
5051 E2: 1 3 5 7 9 11 13 15
5052 E3: 16 18 20 22 24 26 28 30
5053 E4: 17 19 21 23 25 27 29 31
5055 In order to proceed and create the correct sequence for the next stage (or
5056 for the correct output, if the second stage is the last one, as in our
5057 example), we first put the output of extract_even operation and then the
5058 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5059 The input for the second stage is:
5061 1st vec (E1): 0 2 4 6 8 10 12 14
5062 2nd vec (E3): 16 18 20 22 24 26 28 30
5063 3rd vec (E2): 1 3 5 7 9 11 13 15
5064 4th vec (E4): 17 19 21 23 25 27 29 31
5066 The output of the second stage:
5068 E1: 0 4 8 12 16 20 24 28
5069 E2: 2 6 10 14 18 22 26 30
5070 E3: 1 5 9 13 17 21 25 29
5071 E4: 3 7 11 15 19 23 27 31
5073 And RESULT_CHAIN after reordering:
5075 1st vec (E1): 0 4 8 12 16 20 24 28
5076 2nd vec (E3): 1 5 9 13 17 21 25 29
5077 3rd vec (E2): 2 6 10 14 18 22 26 30
5078 4th vec (E4): 3 7 11 15 19 23 27 31. */
5080 static void
5081 vect_permute_load_chain (vec<tree> dr_chain,
5082 unsigned int length,
5083 gimple *stmt,
5084 gimple_stmt_iterator *gsi,
5085 vec<tree> *result_chain)
5087 tree data_ref, first_vect, second_vect;
5088 tree perm_mask_even, perm_mask_odd;
5089 tree perm3_mask_low, perm3_mask_high;
5090 gimple *perm_stmt;
5091 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5092 unsigned int i, j, log_length = exact_log2 (length);
5093 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5094 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5096 result_chain->quick_grow (length);
5097 memcpy (result_chain->address (), dr_chain.address (),
5098 length * sizeof (tree));
5100 if (length == 3)
5102 unsigned int k;
5104 for (k = 0; k < 3; k++)
5106 for (i = 0; i < nelt; i++)
5107 if (3 * i + k < 2 * nelt)
5108 sel[i] = 3 * i + k;
5109 else
5110 sel[i] = 0;
5111 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5113 for (i = 0, j = 0; i < nelt; i++)
5114 if (3 * i + k < 2 * nelt)
5115 sel[i] = i;
5116 else
5117 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5119 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5121 first_vect = dr_chain[0];
5122 second_vect = dr_chain[1];
5124 /* Create interleaving stmt (low part of):
5125 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5126 ...}> */
5127 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5128 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5129 second_vect, perm3_mask_low);
5130 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5132 /* Create interleaving stmt (high part of):
5133 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5134 ...}> */
5135 first_vect = data_ref;
5136 second_vect = dr_chain[2];
5137 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5138 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5139 second_vect, perm3_mask_high);
5140 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5141 (*result_chain)[k] = data_ref;
5144 else
5146 /* If length is not equal to 3 then only power of 2 is supported. */
5147 gcc_assert (pow2p_hwi (length));
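/* E.g., for nelt == 8 PERM_MASK_EVEN below is
   {0, 2, 4, 6, 8, 10, 12, 14} and PERM_MASK_ODD is
   {1, 3, 5, 7, 9, 11, 13, 15}.  */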
5149 for (i = 0; i < nelt; ++i)
5150 sel[i] = i * 2;
5151 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5153 for (i = 0; i < nelt; ++i)
5154 sel[i] = i * 2 + 1;
5155 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5157 for (i = 0; i < log_length; i++)
5159 for (j = 0; j < length; j += 2)
5161 first_vect = dr_chain[j];
5162 second_vect = dr_chain[j+1];
5164 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5165 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5166 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5167 first_vect, second_vect,
5168 perm_mask_even);
5169 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5170 (*result_chain)[j/2] = data_ref;
5172 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5173 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5174 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5175 first_vect, second_vect,
5176 perm_mask_odd);
5177 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5178 (*result_chain)[j/2+length/2] = data_ref;
5180 memcpy (dr_chain.address (), result_chain->address (),
5181 length * sizeof (tree));
5186 /* Function vect_shift_permute_load_chain.
5188 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5189 sequence of stmts to reorder the input data accordingly.
5190 Return the final references for loads in RESULT_CHAIN.
5191 Return true if successful, false otherwise.
5193 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5194 The input is 3 vectors each containing 8 elements. We assign a
5195 number to each element, the input sequence is:
5197 1st vec: 0 1 2 3 4 5 6 7
5198 2nd vec: 8 9 10 11 12 13 14 15
5199 3rd vec: 16 17 18 19 20 21 22 23
5201 The output sequence should be:
5203 1st vec: 0 3 6 9 12 15 18 21
5204 2nd vec: 1 4 7 10 13 16 19 22
5205 3rd vec: 2 5 8 11 14 17 20 23
5207 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5209 First we shuffle all 3 vectors to get correct elements order:
5211 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5212 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5213 3rd vec: (16 19 22) (17 20 23) (18 21)
5215 Next we unite and shift vector 3 times:
5217 1st step:
5218 shift right by 6 the concatenation of:
5219 "1st vec" and "2nd vec"
5220 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5221 "2nd vec" and "3rd vec"
5222 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5223 "3rd vec" and "1st vec"
5224 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5225 | New vectors |
5227 So that now new vectors are:
5229 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5230 2nd vec: (10 13) (16 19 22) (17 20 23)
5231 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5233 2nd step:
5234 shift right by 5 the concatenation of:
5235 "1st vec" and "3rd vec"
5236 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5237 "2nd vec" and "1st vec"
5238 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5239 "3rd vec" and "2nd vec"
5240 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5241 | New vectors |
5243 So that now new vectors are:
5245 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5246 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5247 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5249 3rd step:
5250 shift right by 5 the concatenation of:
5251 "1st vec" and "1st vec"
5252 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5253 shift right by 3 the concatenation of:
5254 "2nd vec" and "2nd vec"
5255 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5256 | New vectors |
5258 So that now all vectors are READY:
5259 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5260 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5261 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5263 This algorithm is faster than the one in vect_permute_load_chain if:
5264 1. "shift of a concatenation" is faster than general permutation.
5265 This is usually so.
5266 2. The TARGET machine can't execute vector instructions in parallel.
5267 This is because each step of the algorithm depends on the previous one.
5268 The algorithm in vect_permute_load_chain is much more parallel.
5270 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5273 static bool
5274 vect_shift_permute_load_chain (vec<tree> dr_chain,
5275 unsigned int length,
5276 gimple *stmt,
5277 gimple_stmt_iterator *gsi,
5278 vec<tree> *result_chain)
5280 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5281 tree perm2_mask1, perm2_mask2, perm3_mask;
5282 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5283 gimple *perm_stmt;
5285 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5286 unsigned int i;
5287 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5288 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5289 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5290 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5292 result_chain->quick_grow (length);
5293 memcpy (result_chain->address (), dr_chain.address (),
5294 length * sizeof (tree));
5296 if (pow2p_hwi (length) && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5298 unsigned int j, log_length = exact_log2 (length);
5299 for (i = 0; i < nelt / 2; ++i)
5300 sel[i] = i * 2;
5301 for (i = 0; i < nelt / 2; ++i)
5302 sel[nelt / 2 + i] = i * 2 + 1;
5303 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5305 if (dump_enabled_p ())
5306 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5307 "shuffle of 2 fields structure is not \
5308 supported by target\n");
5309 return false;
5311 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5313 for (i = 0; i < nelt / 2; ++i)
5314 sel[i] = i * 2 + 1;
5315 for (i = 0; i < nelt / 2; ++i)
5316 sel[nelt / 2 + i] = i * 2;
5317 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5319 if (dump_enabled_p ())
5320 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5321 "shuffle of 2 fields structure is not \
5322 supported by target\n");
5323 return false;
5325 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5327 /* Generating permutation constant to shift all elements.
5328 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5329 for (i = 0; i < nelt; i++)
5330 sel[i] = nelt / 2 + i;
5331 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5333 if (dump_enabled_p ())
5334 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5335 "shift permutation is not supported by target\n");
5336 return false;
5338 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5340 /* Generating permutation constant to select vector from 2.
5341 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5342 for (i = 0; i < nelt / 2; i++)
5343 sel[i] = i;
5344 for (i = nelt / 2; i < nelt; i++)
5345 sel[i] = nelt + i;
5346 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5348 if (dump_enabled_p ())
5349 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5350 "select is not supported by target\n");
5351 return false;
5353 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5355 for (i = 0; i < log_length; i++)
5357 for (j = 0; j < length; j += 2)
5359 first_vect = dr_chain[j];
5360 second_vect = dr_chain[j + 1];
5362 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5363 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5364 first_vect, first_vect,
5365 perm2_mask1);
5366 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5367 vect[0] = data_ref;
5369 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5370 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5371 second_vect, second_vect,
5372 perm2_mask2);
5373 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5374 vect[1] = data_ref;
5376 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5377 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5378 vect[0], vect[1], shift1_mask);
5379 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5380 (*result_chain)[j/2 + length/2] = data_ref;
5382 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5383 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5384 vect[0], vect[1], select_mask);
5385 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5386 (*result_chain)[j/2] = data_ref;
5388 memcpy (dr_chain.address (), result_chain->address (),
5389 length * sizeof (tree));
5391 return true;
5393 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5395 unsigned int k = 0, l = 0;
5397 /* Generating permutation constant to get all elements in the right order.
5398 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5399 for (i = 0; i < nelt; i++)
5401 if (3 * k + (l % 3) >= nelt)
5403 k = 0;
5404 l += (3 - (nelt % 3));
5406 sel[i] = 3 * k + (l % 3);
5407 k++;
5409 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5411 if (dump_enabled_p ())
5412 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5413 "shuffle of 3 fields structure is not \
5414 supported by target\n");
5415 return false;
5417 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5419 /* Generating permutation constant to shift all elements.
5420 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5421 for (i = 0; i < nelt; i++)
5422 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5423 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5425 if (dump_enabled_p ())
5426 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5427 "shift permutation is not supported by target\n");
5428 return false;
5430 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5432 /* Generating permutation constant to shift all elements.
5433 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5434 for (i = 0; i < nelt; i++)
5435 sel[i] = 2 * (nelt / 3) + 1 + i;
5436 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5438 if (dump_enabled_p ())
5439 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5440 "shift permutation is not supported by target\n");
5441 return false;
5443 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5445 /* Generating permutation constant to shift all elements.
5446 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5447 for (i = 0; i < nelt; i++)
5448 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5449 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5451 if (dump_enabled_p ())
5452 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5453 "shift permutation is not supported by target\n");
5454 return false;
5456 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5458 /* Generating permutation constant to shift all elements.
5459 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5460 for (i = 0; i < nelt; i++)
5461 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5462 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5464 if (dump_enabled_p ())
5465 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5466 "shift permutation is not supported by target\n");
5467 return false;
5469 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5471 for (k = 0; k < 3; k++)
5473 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5474 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5475 dr_chain[k], dr_chain[k],
5476 perm3_mask);
5477 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5478 vect[k] = data_ref;
5481 for (k = 0; k < 3; k++)
5483 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5484 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5485 vect[k % 3], vect[(k + 1) % 3],
5486 shift1_mask);
5487 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5488 vect_shift[k] = data_ref;
5491 for (k = 0; k < 3; k++)
5493 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5494 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5495 vect_shift[(4 - k) % 3],
5496 vect_shift[(3 - k) % 3],
5497 shift2_mask);
5498 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5499 vect[k] = data_ref;
5502 (*result_chain)[3 - (nelt % 3)] = vect[2];
5504 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5505 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5506 vect[0], shift3_mask);
5507 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5508 (*result_chain)[nelt % 3] = data_ref;
5510 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5511 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5512 vect[1], shift4_mask);
5513 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5514 (*result_chain)[0] = data_ref;
5515 return true;
5517 return false;
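/* A worked illustration of the LENGTH == 3 path of vect_shift_permute_load_chain
   above (assuming NELT == 8; the element names are chosen purely for
   exposition).  Three input vectors hold fields A, B and C interleaved:

     V1 = {a0 b0 c0 a1 b1 c1 a2 b2}
     V2 = {c2 a3 b3 c3 a4 b4 c4 a5}
     V3 = {b5 c5 a6 b6 c6 a7 b7 c7}

   The {0 3 6 1 4 7 2 5} shuffle groups each vector by field:

     T1 = {a0 a1 a2 b0 b1 b2 c0 c1}
     T2 = {c2 c3 c4 a3 a4 a5 b3 b4}
     T3 = {b5 b6 b7 c5 c6 c7 a6 a7}

   the first shift mask {6 7 8 9 10 11 12 13} glues neighbouring tails:

     S1 = {c0 c1 c2 c3 c4 a3 a4 a5}
     S2 = {b3 b4 b5 b6 b7 c5 c6 c7}
     S3 = {a6 a7 a0 a1 a2 b0 b1 b2}

   and the second shift mask {5 6 7 8 9 10 11 12} yields rotated per-field
   vectors {c5 c6 c7 c0 c1 c2 c3 c4}, {a3 a4 a5 a6 a7 a0 a1 a2} and
   {b0 b1 ... b7}.  The B vector is already in order; the A and C vectors are
   put in order by the final single-vector rotations with shift4_mask and
   shift3_mask, giving RESULT_CHAIN = {A, B, C}.  */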
5520 /* Function vect_transform_grouped_load.
5522 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5523 to perform their permutation and attach the resulting vectorized statements
5524 to the corresponding scalar statements. */
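/* As an illustration (the scalar loop and the names below are hypothetical):
   for a two-element interleaving group such as

     for (i = 0; i < n; i++)
       {
         a = p[2 * i];
         b = p[2 * i + 1];
       }

   DR_CHAIN holds the SIZE (here two) vector loads covering the interleaved
   data, the permutation produces one vector of 'a' elements and one vector of
   'b' elements, and vect_record_grouped_load_vectors then records those
   vectors as the vectorized statements of the corresponding scalar loads.  */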
5527 void
5528 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5529 gimple_stmt_iterator *gsi)
5531 machine_mode mode;
5532 vec<tree> result_chain = vNULL;
5534 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5535 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
5536 permuted vectors, ready for vector computation. */
5537 result_chain.create (size);
5539 /* If the reassociation width for the vector type is 2 or greater, the target
5540 machine can execute two or more vector instructions in parallel. Otherwise
5541 try to get the chain for the load group using vect_shift_permute_load_chain. */
5542 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5543 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5544 || pow2p_hwi (size)
5545 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5546 gsi, &result_chain))
5547 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5548 vect_record_grouped_load_vectors (stmt, result_chain);
5549 result_chain.release ();
5552 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5553 generated as part of the vectorization of STMT. Assign the statement
5554 for each vector to the associated scalar statement. */
5556 void
5557 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5559 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5560 gimple *next_stmt, *new_stmt;
5561 unsigned int i, gap_count;
5562 tree tmp_data_ref;
5564 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5565 Since we scan the chain starting from its first node, their order
5566 corresponds to the order of data-refs in RESULT_CHAIN. */
5567 next_stmt = first_stmt;
5568 gap_count = 1;
5569 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5571 if (!next_stmt)
5572 break;
5574 /* Skip the gaps. Loads created for the gaps will be removed later by the
5575 dead code elimination pass. No need to check for the first stmt in
5576 the group, since it always exists.
5577 GROUP_GAP is the number of steps in elements from the previous
5578 access (if there is no gap GROUP_GAP is 1). We skip loads that
5579 correspond to the gaps. */
5580 if (next_stmt != first_stmt
5581 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5583 gap_count++;
5584 continue;
5587 while (next_stmt)
5589 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5590 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5591 copies, and we put the new vector statement in the first available
5592 RELATED_STMT. */
5593 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5594 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5595 else
5597 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5599 gimple *prev_stmt =
5600 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5601 gimple *rel_stmt =
5602 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5603 while (rel_stmt)
5605 prev_stmt = rel_stmt;
5606 rel_stmt =
5607 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5610 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5611 new_stmt;
5615 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5616 gap_count = 1;
5617 /* If NEXT_STMT accesses the same DR as the previous statement,
5618 put the same TMP_DATA_REF as its vectorized statement; otherwise
5619 get the next data-ref from RESULT_CHAIN. */
5620 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5621 break;
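/* For example (a hypothetical group, purely for illustration): if the scalar
   code reads p[3*i] and p[3*i+2] but never p[3*i+1], the second group member
   has GROUP_GAP == 2, so one entry of RESULT_CHAIN (the permuted vector that
   would have fed the unused field) is skipped above before the next
   vectorized statement is recorded, and the corresponding load is left for
   dead code elimination.  */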
5626 /* Function vect_can_force_dr_alignment_p.
5628 Return whether the alignment of a DECL can be forced to an
5629 ALIGNMENT-bit boundary. */
5631 bool
5632 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5634 if (!VAR_P (decl))
5635 return false;
5637 if (decl_in_symtab_p (decl)
5638 && !symtab_node::get (decl)->can_increase_alignment_p ())
5639 return false;
5641 if (TREE_STATIC (decl))
5642 return (alignment <= MAX_OFILE_ALIGNMENT);
5643 else
5644 return (alignment <= MAX_STACK_ALIGNMENT);
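/* For instance (illustrative only): the alignment of a file-scope array like

     static float a[256];

   can be raised as long as the requested value does not exceed
   MAX_OFILE_ALIGNMENT, because the array lives in static storage, while a
   local (stack) variable is limited by MAX_STACK_ALIGNMENT, and a DECL owned
   by the symbol table is rejected whenever its symtab node reports that its
   alignment cannot be increased.  */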
5648 /* Return whether the data reference DR is supported with respect to its
5649 alignment.
5650 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5651 if it is aligned, i.e., check if it is possible to vectorize it with different
5652 alignment. */
5654 enum dr_alignment_support
5655 vect_supportable_dr_alignment (struct data_reference *dr,
5656 bool check_aligned_accesses)
5658 gimple *stmt = DR_STMT (dr);
5659 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5660 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5661 machine_mode mode = TYPE_MODE (vectype);
5662 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5663 struct loop *vect_loop = NULL;
5664 bool nested_in_vect_loop = false;
5666 if (aligned_access_p (dr) && !check_aligned_accesses)
5667 return dr_aligned;
5669 /* For now assume all conditional loads/stores support unaligned
5670 access without any special code. */
5671 if (is_gimple_call (stmt)
5672 && gimple_call_internal_p (stmt)
5673 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5674 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5675 return dr_unaligned_supported;
5677 if (loop_vinfo)
5679 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5680 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5683 /* Possibly unaligned access. */
5685 /* We can choose between using the implicit realignment scheme (generating
5686 a misaligned_move stmt) and the explicit realignment scheme (generating
5687 aligned loads with a REALIGN_LOAD). There are two variants to the
5688 explicit realignment scheme: optimized, and unoptimized.
5689 We can optimize the realignment only if the step between consecutive
5690 vector loads is equal to the vector size. Since the vector memory
5691 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5692 is guaranteed that the misalignment amount remains the same throughout the
5693 execution of the vectorized loop. Therefore, we can create the
5694 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5695 at the loop preheader.
5697 However, in the case of outer-loop vectorization, when vectorizing a
5698 memory access in the inner-loop nested within the LOOP that is now being
5699 vectorized, while it is guaranteed that the misalignment of the
5700 vectorized memory access will remain the same in different outer-loop
5701 iterations, it is *not* guaranteed that it will remain the same throughout
5702 the execution of the inner-loop. This is because the inner-loop advances
5703 with the original scalar step (and not in steps of VS). If the inner-loop
5704 step happens to be a multiple of VS, then the misalignment remains fixed
5705 and we can use the optimized realignment scheme. For example:
5707 for (i=0; i<N; i++)
5708 for (j=0; j<M; j++)
5709 s += a[i+j];
5711 When vectorizing the i-loop in the above example, the step between
5712 consecutive vector loads is 1, and so the misalignment does not remain
5713 fixed across the execution of the inner-loop, and the realignment cannot
5714 be optimized (as illustrated in the following pseudo vectorized loop):
5716 for (i=0; i<N; i+=4)
5717 for (j=0; j<M; j++){
5718 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5719 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5720 // (assuming that we start from an aligned address).
5723 We therefore have to use the unoptimized realignment scheme:
5725 for (i=0; i<N; i+=4)
5726 for (j=k; j<M; j+=4)
5727 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5728 // that the misalignment of the initial address is
5729 // 0).
5731 The loop can then be vectorized as follows:
5733 for (k=0; k<4; k++){
5734 rt = get_realignment_token (&vp[k]);
5735 for (i=0; i<N; i+=4){
5736 v1 = vp[i+k];
5737 for (j=k; j<M; j+=4){
5738 v2 = vp[i+j+VS-1];
5739 va = REALIGN_LOAD <v1,v2,rt>;
5740 vs += va;
5741 v1 = v2;
5744 } */
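/* A rough sketch of the decision below: already sufficiently aligned accesses
   and masked loads/stores were handled above; for the remaining reads, prefer
   the explicit realignment scheme when the target provides vec_realign_load
   and a usable mask-for-load builtin (and the SLP group layout does not rule
   it out), choosing the optimized variant only when the misalignment is known
   to stay fixed across iterations (in particular not for a nested access
   whose step differs from the vector size); otherwise both reads and writes
   return dr_unaligned_supported if the target supports misaligned vector
   accesses for this mode and type, and dr_unaligned_unsupported if not.  */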
5746 if (DR_IS_READ (dr))
5748 bool is_packed = false;
5749 tree type = (TREE_TYPE (DR_REF (dr)));
5751 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5752 && (!targetm.vectorize.builtin_mask_for_load
5753 || targetm.vectorize.builtin_mask_for_load ()))
5755 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5757 /* If we are doing SLP then the accesses need not have the
5758 same alignment, instead it depends on the SLP group size. */
5759 if (loop_vinfo
5760 && STMT_SLP_TYPE (stmt_info)
5761 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5762 * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
5763 % TYPE_VECTOR_SUBPARTS (vectype) != 0))
5765 else if (!loop_vinfo
5766 || (nested_in_vect_loop
5767 && (TREE_INT_CST_LOW (DR_STEP (dr))
5768 != GET_MODE_SIZE (TYPE_MODE (vectype)))))
5769 return dr_explicit_realign;
5770 else
5771 return dr_explicit_realign_optimized;
5773 if (!known_alignment_for_access_p (dr))
5774 is_packed = not_size_aligned (DR_REF (dr));
5776 if (targetm.vectorize.support_vector_misalignment
5777 (mode, type, DR_MISALIGNMENT (dr), is_packed))
5778 /* Can't software pipeline the loads, but can at least do them. */
5779 return dr_unaligned_supported;
5781 else
5783 bool is_packed = false;
5784 tree type = (TREE_TYPE (DR_REF (dr)));
5786 if (!known_alignment_for_access_p (dr))
5787 is_packed = not_size_aligned (DR_REF (dr));
5789 if (targetm.vectorize.support_vector_misalignment
5790 (mode, type, DR_MISALIGNMENT (dr), is_packed))
5791 return dr_unaligned_supported;
5794 /* Unsupported. */
5795 return dr_unaligned_unsupported;