Add DR_BASE_ALIGNMENT and DR_BASE_MISALIGNMENT
[official-gcc.git] / gcc / tree-vect-data-refs.c
blob3db408c274d2cd20b6a2922fdbd532eb6efce66c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2017 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
55 /* Return true if load- or store-lanes optab OPTAB is implemented for
56 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
58 static bool
59 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
60 tree vectype, unsigned HOST_WIDE_INT count)
62 machine_mode mode, array_mode;
63 bool limit_p;
65 mode = TYPE_MODE (vectype);
66 limit_p = !targetm.array_mode_supported_p (mode, count);
67 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
68 MODE_INT, limit_p);
70 if (array_mode == BLKmode)
72 if (dump_enabled_p ())
73 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
74 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
75 GET_MODE_NAME (mode), count);
76 return false;
79 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
81 if (dump_enabled_p ())
82 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
83 "cannot use %s<%s><%s>\n", name,
84 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
85 return false;
88 if (dump_enabled_p ())
89 dump_printf_loc (MSG_NOTE, vect_location,
90 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
91 GET_MODE_NAME (mode));
93 return true;
97 /* Return the smallest scalar part of STMT.
98 This is used to determine the vectype of the stmt. We generally set the
99 vectype according to the type of the result (lhs). For stmts whose
100 result-type is different than the type of the arguments (e.g., demotion,
101 promotion), vectype will be reset appropriately (later). Note that we have
102 to visit the smallest datatype in this function, because that determines the
103 VF. If the smallest datatype in the loop is present only as the rhs of a
104 promotion operation - we'd miss it.
105 Such a case, where a variable of this datatype does not appear in the lhs
106 anywhere in the loop, can only occur if it's an invariant: e.g.:
107 'int_x = (int) short_inv', which we'd expect to have been optimized away by
108 invariant motion. However, we cannot rely on invariant motion to always
109 take invariants out of the loop, and so in the case of promotion we also
110 have to check the rhs.
111 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
112 types. */
114 tree
115 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
116 HOST_WIDE_INT *rhs_size_unit)
118 tree scalar_type = gimple_expr_type (stmt);
119 HOST_WIDE_INT lhs, rhs;
121 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
123 if (is_gimple_assign (stmt)
124 && (gimple_assign_cast_p (stmt)
125 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
126 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
127 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
129 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
131 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
132 if (rhs < lhs)
133 scalar_type = rhs_type;
136 *lhs_size_unit = lhs;
137 *rhs_size_unit = rhs;
138 return scalar_type;
142 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
143 tested at run-time. Return TRUE if DDR was successfully inserted.
144 Return false if versioning is not supported. */
146 static bool
147 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
149 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
151 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
152 return false;
154 if (!runtime_alias_check_p (ddr, loop,
155 optimize_loop_nest_for_speed_p (loop)))
156 return false;
158 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
159 return true;
163 /* Function vect_analyze_data_ref_dependence.
165 Return TRUE if there (might) exist a dependence between a memory-reference
166 DRA and a memory-reference DRB. When versioning for alias may check a
167 dependence at run-time, return FALSE. Adjust *MAX_VF according to
168 the data dependence. */
170 static bool
171 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
172 loop_vec_info loop_vinfo, int *max_vf)
174 unsigned int i;
175 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
176 struct data_reference *dra = DDR_A (ddr);
177 struct data_reference *drb = DDR_B (ddr);
178 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
179 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
180 lambda_vector dist_v;
181 unsigned int loop_depth;
183 /* In loop analysis all data references should be vectorizable. */
184 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
185 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
186 gcc_unreachable ();
188 /* Independent data accesses. */
189 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
190 return false;
192 if (dra == drb
193 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
194 return false;
196 /* We do not have to consider dependences between accesses that belong
197 to the same group. */
198 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
199 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
200 return false;
202 /* Even if we have an anti-dependence then, as the vectorized loop covers at
203 least two scalar iterations, there is always also a true dependence.
204 As the vectorizer does not re-order loads and stores we can ignore
205 the anti-dependence if TBAA can disambiguate both DRs similar to the
206 case with known negative distance anti-dependences (positive
207 distance anti-dependences would violate TBAA constraints). */
208 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
209 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
210 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
211 get_alias_set (DR_REF (drb))))
212 return false;
214 /* Unknown data dependence. */
215 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
217 /* If user asserted safelen consecutive iterations can be
218 executed concurrently, assume independence. */
219 if (loop->safelen >= 2)
221 if (loop->safelen < *max_vf)
222 *max_vf = loop->safelen;
223 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
224 return false;
227 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
228 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
230 if (dump_enabled_p ())
232 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
233 "versioning for alias not supported for: "
234 "can't determine dependence between ");
235 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
236 DR_REF (dra));
237 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
238 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
239 DR_REF (drb));
240 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
242 return true;
245 if (dump_enabled_p ())
247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
248 "versioning for alias required: "
249 "can't determine dependence between ");
250 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
251 DR_REF (dra));
252 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
253 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
254 DR_REF (drb));
255 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
258 /* Add to list of ddrs that need to be tested at run-time. */
259 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
262 /* Known data dependence. */
263 if (DDR_NUM_DIST_VECTS (ddr) == 0)
265 /* If user asserted safelen consecutive iterations can be
266 executed concurrently, assume independence. */
267 if (loop->safelen >= 2)
269 if (loop->safelen < *max_vf)
270 *max_vf = loop->safelen;
271 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
272 return false;
275 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
276 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
278 if (dump_enabled_p ())
280 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
281 "versioning for alias not supported for: "
282 "bad dist vector for ");
283 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
284 DR_REF (dra));
285 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
286 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
287 DR_REF (drb));
288 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
290 return true;
293 if (dump_enabled_p ())
295 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
296 "versioning for alias required: "
297 "bad dist vector for ");
298 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
299 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
300 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
301 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
303 /* Add to list of ddrs that need to be tested at run-time. */
304 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
307 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
308 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
310 int dist = dist_v[loop_depth];
312 if (dump_enabled_p ())
313 dump_printf_loc (MSG_NOTE, vect_location,
314 "dependence distance = %d.\n", dist);
316 if (dist == 0)
318 if (dump_enabled_p ())
320 dump_printf_loc (MSG_NOTE, vect_location,
321 "dependence distance == 0 between ");
322 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
323 dump_printf (MSG_NOTE, " and ");
324 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
325 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
328 /* When we perform grouped accesses and perform implicit CSE
329 by detecting equal accesses and doing disambiguation with
330 runtime alias tests like for
331 .. = a[i];
332 .. = a[i+1];
333 a[i] = ..;
334 a[i+1] = ..;
335 *p = ..;
336 .. = a[i];
337 .. = a[i+1];
338 where we will end up loading { a[i], a[i+1] } once, make
339 sure that inserting group loads before the first load and
340 stores after the last store will do the right thing.
341 Similar for groups like
342 a[i] = ...;
343 ... = a[i];
344 a[i+1] = ...;
345 where loads from the group interleave with the store. */
346 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
347 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
349 gimple *earlier_stmt;
350 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
351 if (DR_IS_WRITE
352 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
354 if (dump_enabled_p ())
355 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
356 "READ_WRITE dependence in interleaving."
357 "\n");
358 return true;
362 continue;
365 if (dist > 0 && DDR_REVERSED_P (ddr))
367 /* If DDR_REVERSED_P the order of the data-refs in DDR was
368 reversed (to make distance vector positive), and the actual
369 distance is negative. */
370 if (dump_enabled_p ())
371 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
372 "dependence distance negative.\n");
373 /* Record a negative dependence distance to later limit the
374 amount of stmt copying / unrolling we can perform.
375 Only need to handle read-after-write dependence. */
376 if (DR_IS_READ (drb)
377 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
378 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
379 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
380 continue;
383 if (abs (dist) >= 2
384 && abs (dist) < *max_vf)
386 /* The dependence distance requires reduction of the maximal
387 vectorization factor. */
388 *max_vf = abs (dist);
389 if (dump_enabled_p ())
390 dump_printf_loc (MSG_NOTE, vect_location,
391 "adjusting maximal vectorization factor to %i\n",
392 *max_vf);
395 if (abs (dist) >= *max_vf)
397 /* Dependence distance does not create dependence, as far as
398 vectorization is concerned, in this case. */
399 if (dump_enabled_p ())
400 dump_printf_loc (MSG_NOTE, vect_location,
401 "dependence distance >= VF.\n");
402 continue;
405 if (dump_enabled_p ())
407 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
408 "not vectorized, possible dependence "
409 "between data-refs ");
410 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
411 dump_printf (MSG_NOTE, " and ");
412 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
413 dump_printf (MSG_NOTE, "\n");
416 return true;
419 return false;
422 /* Function vect_analyze_data_ref_dependences.
424 Examine all the data references in the loop, and make sure there do not
425 exist any data dependences between them. Set *MAX_VF according to
426 the maximum vectorization factor the data dependences allow. */
428 bool
429 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
431 unsigned int i;
432 struct data_dependence_relation *ddr;
434 if (dump_enabled_p ())
435 dump_printf_loc (MSG_NOTE, vect_location,
436 "=== vect_analyze_data_ref_dependences ===\n");
438 LOOP_VINFO_DDRS (loop_vinfo)
439 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
440 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
441 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
442 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
443 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
444 &LOOP_VINFO_DDRS (loop_vinfo),
445 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
446 return false;
448 /* For epilogues we either have no aliases or alias versioning
449 was applied to original loop. Therefore we may just get max_vf
450 using VF of original loop. */
451 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
452 *max_vf = LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo);
453 else
454 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
455 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
456 return false;
458 return true;
462 /* Function vect_slp_analyze_data_ref_dependence.
464 Return TRUE if there (might) exist a dependence between a memory-reference
465 DRA and a memory-reference DRB. When versioning for alias may check a
466 dependence at run-time, return FALSE. Adjust *MAX_VF according to
467 the data dependence. */
469 static bool
470 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
472 struct data_reference *dra = DDR_A (ddr);
473 struct data_reference *drb = DDR_B (ddr);
475 /* We need to check dependences of statements marked as unvectorizable
476 as well, they still can prohibit vectorization. */
478 /* Independent data accesses. */
479 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
480 return false;
482 if (dra == drb)
483 return false;
485 /* Read-read is OK. */
486 if (DR_IS_READ (dra) && DR_IS_READ (drb))
487 return false;
489 /* If dra and drb are part of the same interleaving chain consider
490 them independent. */
491 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
492 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
493 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
494 return false;
496 /* Unknown data dependence. */
497 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
499 if (dump_enabled_p ())
501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
502 "can't determine dependence between ");
503 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
504 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
505 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
506 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
509 else if (dump_enabled_p ())
511 dump_printf_loc (MSG_NOTE, vect_location,
512 "determined dependence between ");
513 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
514 dump_printf (MSG_NOTE, " and ");
515 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
516 dump_printf (MSG_NOTE, "\n");
519 return true;
523 /* Analyze dependences involved in the transform of SLP NODE. STORES
524 contain the vector of scalar stores of this instance if we are
525 disambiguating the loads. */
527 static bool
528 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
529 vec<gimple *> stores, gimple *last_store)
531 /* This walks over all stmts involved in the SLP load/store done
532 in NODE verifying we can sink them up to the last stmt in the
533 group. */
534 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
535 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
537 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
538 if (access == last_access)
539 continue;
540 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
541 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
542 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
544 gimple *stmt = gsi_stmt (gsi);
545 if (! gimple_vuse (stmt)
546 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
547 continue;
549 /* If we couldn't record a (single) data reference for this
550 stmt we have to give up. */
551 /* ??? Here and below if dependence analysis fails we can resort
552 to the alias oracle which can handle more kinds of stmts. */
553 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
554 if (!dr_b)
555 return false;
557 bool dependent = false;
558 /* If we run into a store of this same instance (we've just
559 marked those) then delay dependence checking until we run
560 into the last store because this is where it will have
561 been sunk to (and we verify if we can do that as well). */
562 if (gimple_visited_p (stmt))
564 if (stmt != last_store)
565 continue;
566 unsigned i;
567 gimple *store;
568 FOR_EACH_VEC_ELT (stores, i, store)
570 data_reference *store_dr
571 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
572 ddr_p ddr = initialize_data_dependence_relation
573 (dr_a, store_dr, vNULL);
574 dependent = vect_slp_analyze_data_ref_dependence (ddr);
575 free_dependence_relation (ddr);
576 if (dependent)
577 break;
580 else
582 ddr_p ddr = initialize_data_dependence_relation (dr_a,
583 dr_b, vNULL);
584 dependent = vect_slp_analyze_data_ref_dependence (ddr);
585 free_dependence_relation (ddr);
587 if (dependent)
588 return false;
591 return true;
595 /* Function vect_analyze_data_ref_dependences.
597 Examine all the data references in the basic-block, and make sure there
598 do not exist any data dependences between them. Set *MAX_VF according to
599 the maximum vectorization factor the data dependences allow. */
601 bool
602 vect_slp_analyze_instance_dependence (slp_instance instance)
604 if (dump_enabled_p ())
605 dump_printf_loc (MSG_NOTE, vect_location,
606 "=== vect_slp_analyze_instance_dependence ===\n");
608 /* The stores of this instance are at the root of the SLP tree. */
609 slp_tree store = SLP_INSTANCE_TREE (instance);
610 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
611 store = NULL;
613 /* Verify we can sink stores to the vectorized stmt insert location. */
614 gimple *last_store = NULL;
615 if (store)
617 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
618 return false;
620 /* Mark stores in this instance and remember the last one. */
621 last_store = vect_find_last_scalar_stmt_in_slp (store);
622 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
623 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
626 bool res = true;
628 /* Verify we can sink loads to the vectorized stmt insert location,
629 special-casing stores of this instance. */
630 slp_tree load;
631 unsigned int i;
632 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
633 if (! vect_slp_analyze_node_dependences (instance, load,
634 store
635 ? SLP_TREE_SCALAR_STMTS (store)
636 : vNULL, last_store))
638 res = false;
639 break;
642 /* Unset the visited flag. */
643 if (store)
644 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
645 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
647 return res;
650 /* Function vect_compute_data_ref_alignment
652 Compute the misalignment of the data reference DR.
654 Output:
655 1. If during the misalignment computation it is found that the data reference
656 cannot be vectorized then false is returned.
657 2. DR_MISALIGNMENT (DR) is defined.
659 FOR NOW: No analysis is actually performed. Misalignment is calculated
660 only for trivial cases. TODO. */
662 bool
663 vect_compute_data_ref_alignment (struct data_reference *dr)
665 gimple *stmt = DR_STMT (dr);
666 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
667 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
668 struct loop *loop = NULL;
669 tree ref = DR_REF (dr);
670 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
672 if (dump_enabled_p ())
673 dump_printf_loc (MSG_NOTE, vect_location,
674 "vect_compute_data_ref_alignment:\n");
676 if (loop_vinfo)
677 loop = LOOP_VINFO_LOOP (loop_vinfo);
679 /* Initialize misalignment to unknown. */
680 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
682 innermost_loop_behavior *drb = vect_dr_behavior (dr);
683 bool step_preserves_misalignment_p;
685 /* No step for BB vectorization. */
686 if (!loop)
688 gcc_assert (integer_zerop (drb->step));
689 step_preserves_misalignment_p = true;
692 /* In case the dataref is in an inner-loop of the loop that is being
693 vectorized (LOOP), we use the base and misalignment information
694 relative to the outer-loop (LOOP). This is ok only if the misalignment
695 stays the same throughout the execution of the inner-loop, which is why
696 we have to check that the stride of the dataref in the inner-loop evenly
697 divides by the vector size. */
698 else if (nested_in_vect_loop_p (loop, stmt))
700 step_preserves_misalignment_p
701 = (DR_STEP_ALIGNMENT (dr)
702 % GET_MODE_SIZE (TYPE_MODE (vectype))) == 0;
704 if (dump_enabled_p ())
706 if (step_preserves_misalignment_p)
707 dump_printf_loc (MSG_NOTE, vect_location,
708 "inner step divides the vector-size.\n");
709 else
710 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
711 "inner step doesn't divide the vector-size.\n");
715 /* Similarly we can only use base and misalignment information relative to
716 an innermost loop if the misalignment stays the same throughout the
717 execution of the loop. As above, this is the case if the stride of
718 the dataref evenly divides by the vector size. */
719 else
721 unsigned vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
722 step_preserves_misalignment_p
723 = ((DR_STEP_ALIGNMENT (dr) * vf)
724 % GET_MODE_SIZE (TYPE_MODE (vectype))) == 0;
726 if (!step_preserves_misalignment_p && dump_enabled_p ())
727 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
728 "step doesn't divide the vector-size.\n");
731 unsigned int base_alignment = drb->base_alignment;
732 unsigned int base_misalignment = drb->base_misalignment;
733 unsigned HOST_WIDE_INT vector_alignment = TYPE_ALIGN_UNIT (vectype);
734 unsigned HOST_WIDE_INT element_alignment
735 = TYPE_ALIGN_UNIT (TREE_TYPE (vectype));
737 if (base_alignment >= element_alignment
738 && (base_misalignment & (element_alignment - 1)) == 0)
739 DR_VECT_AUX (dr)->base_element_aligned = true;
741 if (drb->offset_alignment < vector_alignment
742 || !step_preserves_misalignment_p
743 /* We need to know whether the step wrt the vectorized loop is
744 negative when computing the starting misalignment below. */
745 || TREE_CODE (drb->step) != INTEGER_CST)
747 if (dump_enabled_p ())
749 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
750 "Unknown alignment for access: ");
751 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
752 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
754 return true;
757 if (base_alignment < vector_alignment)
759 tree base = drb->base_address;
760 if (TREE_CODE (base) == ADDR_EXPR)
761 base = TREE_OPERAND (base, 0);
762 if (!vect_can_force_dr_alignment_p (base,
763 vector_alignment * BITS_PER_UNIT))
765 if (dump_enabled_p ())
767 dump_printf_loc (MSG_NOTE, vect_location,
768 "can't force alignment of ref: ");
769 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
770 dump_printf (MSG_NOTE, "\n");
772 return true;
775 if (DECL_USER_ALIGN (base))
777 if (dump_enabled_p ())
779 dump_printf_loc (MSG_NOTE, vect_location,
780 "not forcing alignment of user-aligned "
781 "variable: ");
782 dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
783 dump_printf (MSG_NOTE, "\n");
785 return true;
788 /* Force the alignment of the decl.
789 NOTE: This is the only change to the code we make during
790 the analysis phase, before deciding to vectorize the loop. */
791 if (dump_enabled_p ())
793 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
794 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
795 dump_printf (MSG_NOTE, "\n");
798 DR_VECT_AUX (dr)->base_decl = base;
799 DR_VECT_AUX (dr)->base_misaligned = true;
800 DR_VECT_AUX (dr)->base_element_aligned = true;
801 base_misalignment = 0;
803 unsigned int misalignment = (base_misalignment
804 + TREE_INT_CST_LOW (drb->init));
806 /* If this is a backward running DR then first access in the larger
807 vectype actually is N-1 elements before the address in the DR.
808 Adjust misalign accordingly. */
809 if (tree_int_cst_sgn (drb->step) < 0)
810 /* PLUS because STEP is negative. */
811 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
812 * TREE_INT_CST_LOW (drb->step));
814 SET_DR_MISALIGNMENT (dr, misalignment & (vector_alignment - 1));
816 if (dump_enabled_p ())
818 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
819 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
820 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
821 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
824 return true;
828 /* Function vect_update_misalignment_for_peel.
829 Sets DR's misalignment
830 - to 0 if it has the same alignment as DR_PEEL,
831 - to the misalignment computed using NPEEL if DR's salignment is known,
832 - to -1 (unknown) otherwise.
834 DR - the data reference whose misalignment is to be adjusted.
835 DR_PEEL - the data reference whose misalignment is being made
836 zero in the vector loop by the peel.
837 NPEEL - the number of iterations in the peel loop if the misalignment
838 of DR_PEEL is known at compile time. */
840 static void
841 vect_update_misalignment_for_peel (struct data_reference *dr,
842 struct data_reference *dr_peel, int npeel)
844 unsigned int i;
845 vec<dr_p> same_aligned_drs;
846 struct data_reference *current_dr;
847 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
848 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
849 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
850 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
852 /* For interleaved data accesses the step in the loop must be multiplied by
853 the size of the interleaving group. */
854 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
855 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
856 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
857 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
859 /* It can be assumed that the data refs with the same alignment as dr_peel
860 are aligned in the vector loop. */
861 same_aligned_drs
862 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
863 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
865 if (current_dr != dr)
866 continue;
867 gcc_assert (!known_alignment_for_access_p (dr)
868 || !known_alignment_for_access_p (dr_peel)
869 || (DR_MISALIGNMENT (dr) / dr_size
870 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
871 SET_DR_MISALIGNMENT (dr, 0);
872 return;
875 if (known_alignment_for_access_p (dr)
876 && known_alignment_for_access_p (dr_peel))
878 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
879 int misal = DR_MISALIGNMENT (dr);
880 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
881 misal += negative ? -npeel * dr_size : npeel * dr_size;
882 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
883 SET_DR_MISALIGNMENT (dr, misal);
884 return;
887 if (dump_enabled_p ())
888 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
889 "to unknown (-1).\n");
890 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
894 /* Function verify_data_ref_alignment
896 Return TRUE if DR can be handled with respect to alignment. */
898 static bool
899 verify_data_ref_alignment (data_reference_p dr)
901 enum dr_alignment_support supportable_dr_alignment
902 = vect_supportable_dr_alignment (dr, false);
903 if (!supportable_dr_alignment)
905 if (dump_enabled_p ())
907 if (DR_IS_READ (dr))
908 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
909 "not vectorized: unsupported unaligned load.");
910 else
911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
912 "not vectorized: unsupported unaligned "
913 "store.");
915 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
916 DR_REF (dr));
917 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
919 return false;
922 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
923 dump_printf_loc (MSG_NOTE, vect_location,
924 "Vectorizing an unaligned access.\n");
926 return true;
929 /* Function vect_verify_datarefs_alignment
931 Return TRUE if all data references in the loop can be
932 handled with respect to alignment. */
934 bool
935 vect_verify_datarefs_alignment (loop_vec_info vinfo)
937 vec<data_reference_p> datarefs = vinfo->datarefs;
938 struct data_reference *dr;
939 unsigned int i;
941 FOR_EACH_VEC_ELT (datarefs, i, dr)
943 gimple *stmt = DR_STMT (dr);
944 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
946 if (!STMT_VINFO_RELEVANT_P (stmt_info))
947 continue;
949 /* For interleaving, only the alignment of the first access matters. */
950 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
951 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
952 continue;
954 /* Strided accesses perform only component accesses, alignment is
955 irrelevant for them. */
956 if (STMT_VINFO_STRIDED_P (stmt_info)
957 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
958 continue;
960 if (! verify_data_ref_alignment (dr))
961 return false;
964 return true;
967 /* Given an memory reference EXP return whether its alignment is less
968 than its size. */
970 static bool
971 not_size_aligned (tree exp)
973 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
974 return true;
976 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
977 > get_object_alignment (exp));
980 /* Function vector_alignment_reachable_p
982 Return true if vector alignment for DR is reachable by peeling
983 a few loop iterations. Return false otherwise. */
985 static bool
986 vector_alignment_reachable_p (struct data_reference *dr)
988 gimple *stmt = DR_STMT (dr);
989 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
990 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
992 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
994 /* For interleaved access we peel only if number of iterations in
995 the prolog loop ({VF - misalignment}), is a multiple of the
996 number of the interleaved accesses. */
997 int elem_size, mis_in_elements;
998 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1000 /* FORNOW: handle only known alignment. */
1001 if (!known_alignment_for_access_p (dr))
1002 return false;
1004 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1005 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1007 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1008 return false;
1011 /* If misalignment is known at the compile time then allow peeling
1012 only if natural alignment is reachable through peeling. */
1013 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1015 HOST_WIDE_INT elmsize =
1016 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1017 if (dump_enabled_p ())
1019 dump_printf_loc (MSG_NOTE, vect_location,
1020 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1021 dump_printf (MSG_NOTE,
1022 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1024 if (DR_MISALIGNMENT (dr) % elmsize)
1026 if (dump_enabled_p ())
1027 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1028 "data size does not divide the misalignment.\n");
1029 return false;
1033 if (!known_alignment_for_access_p (dr))
1035 tree type = TREE_TYPE (DR_REF (dr));
1036 bool is_packed = not_size_aligned (DR_REF (dr));
1037 if (dump_enabled_p ())
1038 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1039 "Unknown misalignment, %snaturally aligned\n",
1040 is_packed ? "not " : "");
1041 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1044 return true;
1048 /* Calculate the cost of the memory access represented by DR. */
1050 static void
1051 vect_get_data_access_cost (struct data_reference *dr,
1052 unsigned int *inside_cost,
1053 unsigned int *outside_cost,
1054 stmt_vector_for_cost *body_cost_vec)
1056 gimple *stmt = DR_STMT (dr);
1057 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1058 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1059 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1060 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1061 int ncopies = MAX (1, vf / nunits); /* TODO: Handle SLP properly */
1063 if (DR_IS_READ (dr))
1064 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1065 NULL, body_cost_vec, false);
1066 else
1067 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1069 if (dump_enabled_p ())
1070 dump_printf_loc (MSG_NOTE, vect_location,
1071 "vect_get_data_access_cost: inside_cost = %d, "
1072 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1076 typedef struct _vect_peel_info
1078 struct data_reference *dr;
1079 int npeel;
1080 unsigned int count;
1081 } *vect_peel_info;
1083 typedef struct _vect_peel_extended_info
1085 struct _vect_peel_info peel_info;
1086 unsigned int inside_cost;
1087 unsigned int outside_cost;
1088 stmt_vector_for_cost body_cost_vec;
1089 } *vect_peel_extended_info;
1092 /* Peeling hashtable helpers. */
1094 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1096 static inline hashval_t hash (const _vect_peel_info *);
1097 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1100 inline hashval_t
1101 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1103 return (hashval_t) peel_info->npeel;
1106 inline bool
1107 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1109 return (a->npeel == b->npeel);
1113 /* Insert DR into peeling hash table with NPEEL as key. */
1115 static void
1116 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1117 loop_vec_info loop_vinfo, struct data_reference *dr,
1118 int npeel)
1120 struct _vect_peel_info elem, *slot;
1121 _vect_peel_info **new_slot;
1122 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1124 elem.npeel = npeel;
1125 slot = peeling_htab->find (&elem);
1126 if (slot)
1127 slot->count++;
1128 else
1130 slot = XNEW (struct _vect_peel_info);
1131 slot->npeel = npeel;
1132 slot->dr = dr;
1133 slot->count = 1;
1134 new_slot = peeling_htab->find_slot (slot, INSERT);
1135 *new_slot = slot;
1138 if (!supportable_dr_alignment
1139 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1140 slot->count += VECT_MAX_COST;
1144 /* Traverse peeling hash table to find peeling option that aligns maximum
1145 number of data accesses. */
1148 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1149 _vect_peel_extended_info *max)
1151 vect_peel_info elem = *slot;
1153 if (elem->count > max->peel_info.count
1154 || (elem->count == max->peel_info.count
1155 && max->peel_info.npeel > elem->npeel))
1157 max->peel_info.npeel = elem->npeel;
1158 max->peel_info.count = elem->count;
1159 max->peel_info.dr = elem->dr;
1162 return 1;
1165 /* Get the costs of peeling NPEEL iterations checking data access costs
1166 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1167 misalignment will be zero after peeling. */
1169 static void
1170 vect_get_peeling_costs_all_drs (struct data_reference *dr0,
1171 unsigned int *inside_cost,
1172 unsigned int *outside_cost,
1173 stmt_vector_for_cost *body_cost_vec,
1174 unsigned int npeel,
1175 bool unknown_misalignment)
1177 gimple *stmt = DR_STMT (dr0);
1178 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1179 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1180 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1182 unsigned i;
1183 data_reference *dr;
1185 FOR_EACH_VEC_ELT (datarefs, i, dr)
1187 stmt = DR_STMT (dr);
1188 stmt_info = vinfo_for_stmt (stmt);
1189 /* For interleaving, only the alignment of the first access
1190 matters. */
1191 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1192 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1193 continue;
1195 /* Strided accesses perform only component accesses, alignment is
1196 irrelevant for them. */
1197 if (STMT_VINFO_STRIDED_P (stmt_info)
1198 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1199 continue;
1201 int save_misalignment;
1202 save_misalignment = DR_MISALIGNMENT (dr);
1203 if (unknown_misalignment && dr == dr0)
1204 SET_DR_MISALIGNMENT (dr, 0);
1205 else
1206 vect_update_misalignment_for_peel (dr, dr0, npeel);
1207 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1208 body_cost_vec);
1209 SET_DR_MISALIGNMENT (dr, save_misalignment);
1213 /* Traverse peeling hash table and calculate cost for each peeling option.
1214 Find the one with the lowest cost. */
1217 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1218 _vect_peel_extended_info *min)
1220 vect_peel_info elem = *slot;
1221 int dummy;
1222 unsigned int inside_cost = 0, outside_cost = 0;
1223 gimple *stmt = DR_STMT (elem->dr);
1224 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1225 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1226 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1227 epilogue_cost_vec;
1229 prologue_cost_vec.create (2);
1230 body_cost_vec.create (2);
1231 epilogue_cost_vec.create (2);
1233 vect_get_peeling_costs_all_drs (elem->dr, &inside_cost, &outside_cost,
1234 &body_cost_vec, elem->npeel, false);
1236 outside_cost += vect_get_known_peeling_cost
1237 (loop_vinfo, elem->npeel, &dummy,
1238 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1239 &prologue_cost_vec, &epilogue_cost_vec);
1241 /* Prologue and epilogue costs are added to the target model later.
1242 These costs depend only on the scalar iteration cost, the
1243 number of peeling iterations finally chosen, and the number of
1244 misaligned statements. So discard the information found here. */
1245 prologue_cost_vec.release ();
1246 epilogue_cost_vec.release ();
1248 if (inside_cost < min->inside_cost
1249 || (inside_cost == min->inside_cost
1250 && outside_cost < min->outside_cost))
1252 min->inside_cost = inside_cost;
1253 min->outside_cost = outside_cost;
1254 min->body_cost_vec.release ();
1255 min->body_cost_vec = body_cost_vec;
1256 min->peel_info.dr = elem->dr;
1257 min->peel_info.npeel = elem->npeel;
1258 min->peel_info.count = elem->count;
1260 else
1261 body_cost_vec.release ();
1263 return 1;
1267 /* Choose best peeling option by traversing peeling hash table and either
1268 choosing an option with the lowest cost (if cost model is enabled) or the
1269 option that aligns as many accesses as possible. */
1271 static struct _vect_peel_extended_info
1272 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1273 loop_vec_info loop_vinfo,
1274 unsigned int *npeel,
1275 stmt_vector_for_cost *body_cost_vec)
1277 struct _vect_peel_extended_info res;
1279 res.peel_info.dr = NULL;
1280 res.body_cost_vec = stmt_vector_for_cost ();
1282 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1284 res.inside_cost = INT_MAX;
1285 res.outside_cost = INT_MAX;
1286 peeling_htab->traverse <_vect_peel_extended_info *,
1287 vect_peeling_hash_get_lowest_cost> (&res);
1289 else
1291 res.peel_info.count = 0;
1292 peeling_htab->traverse <_vect_peel_extended_info *,
1293 vect_peeling_hash_get_most_frequent> (&res);
1294 res.inside_cost = 0;
1295 res.outside_cost = 0;
1298 *npeel = res.peel_info.npeel;
1299 *body_cost_vec = res.body_cost_vec;
1300 return res;
1303 /* Return true if the new peeling NPEEL is supported. */
1305 static bool
1306 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1307 unsigned npeel)
1309 unsigned i;
1310 struct data_reference *dr = NULL;
1311 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1312 gimple *stmt;
1313 stmt_vec_info stmt_info;
1314 enum dr_alignment_support supportable_dr_alignment;
1316 /* Ensure that all data refs can be vectorized after the peel. */
1317 FOR_EACH_VEC_ELT (datarefs, i, dr)
1319 int save_misalignment;
1321 if (dr == dr0)
1322 continue;
1324 stmt = DR_STMT (dr);
1325 stmt_info = vinfo_for_stmt (stmt);
1326 /* For interleaving, only the alignment of the first access
1327 matters. */
1328 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1329 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1330 continue;
1332 /* Strided accesses perform only component accesses, alignment is
1333 irrelevant for them. */
1334 if (STMT_VINFO_STRIDED_P (stmt_info)
1335 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1336 continue;
1338 save_misalignment = DR_MISALIGNMENT (dr);
1339 vect_update_misalignment_for_peel (dr, dr0, npeel);
1340 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1341 SET_DR_MISALIGNMENT (dr, save_misalignment);
1343 if (!supportable_dr_alignment)
1344 return false;
1347 return true;
1350 /* Function vect_enhance_data_refs_alignment
1352 This pass will use loop versioning and loop peeling in order to enhance
1353 the alignment of data references in the loop.
1355 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1356 original loop is to be vectorized. Any other loops that are created by
1357 the transformations performed in this pass - are not supposed to be
1358 vectorized. This restriction will be relaxed.
1360 This pass will require a cost model to guide it whether to apply peeling
1361 or versioning or a combination of the two. For example, the scheme that
1362 intel uses when given a loop with several memory accesses, is as follows:
1363 choose one memory access ('p') which alignment you want to force by doing
1364 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1365 other accesses are not necessarily aligned, or (2) use loop versioning to
1366 generate one loop in which all accesses are aligned, and another loop in
1367 which only 'p' is necessarily aligned.
1369 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1370 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1371 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1373 Devising a cost model is the most critical aspect of this work. It will
1374 guide us on which access to peel for, whether to use loop versioning, how
1375 many versions to create, etc. The cost model will probably consist of
1376 generic considerations as well as target specific considerations (on
1377 powerpc for example, misaligned stores are more painful than misaligned
1378 loads).
1380 Here are the general steps involved in alignment enhancements:
1382 -- original loop, before alignment analysis:
1383 for (i=0; i<N; i++){
1384 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1385 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1388 -- After vect_compute_data_refs_alignment:
1389 for (i=0; i<N; i++){
1390 x = q[i]; # DR_MISALIGNMENT(q) = 3
1391 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1394 -- Possibility 1: we do loop versioning:
1395 if (p is aligned) {
1396 for (i=0; i<N; i++){ # loop 1A
1397 x = q[i]; # DR_MISALIGNMENT(q) = 3
1398 p[i] = y; # DR_MISALIGNMENT(p) = 0
1401 else {
1402 for (i=0; i<N; i++){ # loop 1B
1403 x = q[i]; # DR_MISALIGNMENT(q) = 3
1404 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1408 -- Possibility 2: we do loop peeling:
1409 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1410 x = q[i];
1411 p[i] = y;
1413 for (i = 3; i < N; i++){ # loop 2A
1414 x = q[i]; # DR_MISALIGNMENT(q) = 0
1415 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1418 -- Possibility 3: combination of loop peeling and versioning:
1419 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1420 x = q[i];
1421 p[i] = y;
1423 if (p is aligned) {
1424 for (i = 3; i<N; i++){ # loop 3A
1425 x = q[i]; # DR_MISALIGNMENT(q) = 0
1426 p[i] = y; # DR_MISALIGNMENT(p) = 0
1429 else {
1430 for (i = 3; i<N; i++){ # loop 3B
1431 x = q[i]; # DR_MISALIGNMENT(q) = 0
1432 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1436 These loops are later passed to loop_transform to be vectorized. The
1437 vectorizer will use the alignment information to guide the transformation
1438 (whether to generate regular loads/stores, or with special handling for
1439 misalignment). */
1441 bool
1442 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1444 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1445 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1446 enum dr_alignment_support supportable_dr_alignment;
1447 struct data_reference *dr0 = NULL, *first_store = NULL;
1448 struct data_reference *dr;
1449 unsigned int i, j;
1450 bool do_peeling = false;
1451 bool do_versioning = false;
1452 bool stat;
1453 gimple *stmt;
1454 stmt_vec_info stmt_info;
1455 unsigned int npeel = 0;
1456 bool one_misalignment_known = false;
1457 bool one_misalignment_unknown = false;
1458 bool one_dr_unsupportable = false;
1459 struct data_reference *unsupportable_dr = NULL;
1460 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1461 unsigned possible_npeel_number = 1;
1462 tree vectype;
1463 unsigned int nelements, mis, same_align_drs_max = 0;
1464 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1465 hash_table<peel_info_hasher> peeling_htab (1);
1467 if (dump_enabled_p ())
1468 dump_printf_loc (MSG_NOTE, vect_location,
1469 "=== vect_enhance_data_refs_alignment ===\n");
1471 /* Reset data so we can safely be called multiple times. */
1472 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1473 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1475 /* While cost model enhancements are expected in the future, the high level
1476 view of the code at this time is as follows:
1478 A) If there is a misaligned access then see if peeling to align
1479 this access can make all data references satisfy
1480 vect_supportable_dr_alignment. If so, update data structures
1481 as needed and return true.
1483 B) If peeling wasn't possible and there is a data reference with an
1484 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1485 then see if loop versioning checks can be used to make all data
1486 references satisfy vect_supportable_dr_alignment. If so, update
1487 data structures as needed and return true.
1489 C) If neither peeling nor versioning were successful then return false if
1490 any data reference does not satisfy vect_supportable_dr_alignment.
1492 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1494 Note, Possibility 3 above (which is peeling and versioning together) is not
1495 being done at this time. */
1497 /* (1) Peeling to force alignment. */
1499 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1500 Considerations:
1501 + How many accesses will become aligned due to the peeling
1502 - How many accesses will become unaligned due to the peeling,
1503 and the cost of misaligned accesses.
1504 - The cost of peeling (the extra runtime checks, the increase
1505 in code size). */
1507 FOR_EACH_VEC_ELT (datarefs, i, dr)
1509 stmt = DR_STMT (dr);
1510 stmt_info = vinfo_for_stmt (stmt);
1512 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1513 continue;
1515 /* For interleaving, only the alignment of the first access
1516 matters. */
1517 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1518 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1519 continue;
1521 /* For invariant accesses there is nothing to enhance. */
1522 if (integer_zerop (DR_STEP (dr)))
1523 continue;
1525 /* Strided accesses perform only component accesses, alignment is
1526 irrelevant for them. */
1527 if (STMT_VINFO_STRIDED_P (stmt_info)
1528 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1529 continue;
1531 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1532 do_peeling = vector_alignment_reachable_p (dr);
1533 if (do_peeling)
1535 if (known_alignment_for_access_p (dr))
1537 unsigned int npeel_tmp = 0;
1538 bool negative = tree_int_cst_compare (DR_STEP (dr),
1539 size_zero_node) < 0;
1541 vectype = STMT_VINFO_VECTYPE (stmt_info);
1542 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1543 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1544 TREE_TYPE (DR_REF (dr))));
1545 if (DR_MISALIGNMENT (dr) != 0)
1546 npeel_tmp = (negative ? (mis - nelements)
1547 : (nelements - mis)) & (nelements - 1);
1549 /* For multiple types, it is possible that the bigger type access
1550 will have more than one peeling option. E.g., a loop with two
1551 types: one of size (vector size / 4), and the other one of
1552 size (vector size / 8). Vectorization factor will 8. If both
1553 accesses are misaligned by 3, the first one needs one scalar
1554 iteration to be aligned, and the second one needs 5. But the
1555 first one will be aligned also by peeling 5 scalar
1556 iterations, and in that case both accesses will be aligned.
1557 Hence, except for the immediate peeling amount, we also want
1558 to try to add full vector size, while we don't exceed
1559 vectorization factor.
1560 We do this automatically for cost model, since we calculate
1561 cost for every peeling option. */
1562 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1564 if (STMT_SLP_TYPE (stmt_info))
1565 possible_npeel_number
1566 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1567 else
1568 possible_npeel_number = vf / nelements;
1570 /* NPEEL_TMP is 0 when there is no misalignment, but also
1571 allow peeling NELEMENTS. */
1572 if (DR_MISALIGNMENT (dr) == 0)
1573 possible_npeel_number++;
1576 /* Save info about DR in the hash table. Also include peeling
1577 amounts according to the explanation above. */
1578 for (j = 0; j < possible_npeel_number; j++)
1580 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1581 dr, npeel_tmp);
1582 npeel_tmp += nelements;
1585 one_misalignment_known = true;
1587 else
1589 /* If we don't know any misalignment values, we prefer
1590 peeling for data-ref that has the maximum number of data-refs
1591 with the same alignment, unless the target prefers to align
1592 stores over load. */
1593 unsigned same_align_drs
1594 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1595 if (!dr0
1596 || same_align_drs_max < same_align_drs)
1598 same_align_drs_max = same_align_drs;
1599 dr0 = dr;
1601 /* For data-refs with the same number of related
1602 accesses prefer the one where the misalign
1603 computation will be invariant in the outermost loop. */
1604 else if (same_align_drs_max == same_align_drs)
1606 struct loop *ivloop0, *ivloop;
1607 ivloop0 = outermost_invariant_loop_for_expr
1608 (loop, DR_BASE_ADDRESS (dr0));
1609 ivloop = outermost_invariant_loop_for_expr
1610 (loop, DR_BASE_ADDRESS (dr));
1611 if ((ivloop && !ivloop0)
1612 || (ivloop && ivloop0
1613 && flow_loop_nested_p (ivloop, ivloop0)))
1614 dr0 = dr;
1617 one_misalignment_unknown = true;
1619 /* Check for data refs with unsupportable alignment that
1620 can be peeled. */
1621 if (!supportable_dr_alignment)
1623 one_dr_unsupportable = true;
1624 unsupportable_dr = dr;
1627 if (!first_store && DR_IS_WRITE (dr))
1628 first_store = dr;
1631 else
1633 if (!aligned_access_p (dr))
1635 if (dump_enabled_p ())
1636 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1637 "vector alignment may not be reachable\n");
1638 break;
1643 /* Check if we can possibly peel the loop. */
1644 if (!vect_can_advance_ivs_p (loop_vinfo)
1645 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1646 || loop->inner)
1647 do_peeling = false;
1649 struct _vect_peel_extended_info peel_for_known_alignment;
1650 struct _vect_peel_extended_info peel_for_unknown_alignment;
1651 struct _vect_peel_extended_info best_peel;
1653 peel_for_unknown_alignment.inside_cost = INT_MAX;
1654 peel_for_unknown_alignment.outside_cost = INT_MAX;
1655 peel_for_unknown_alignment.peel_info.count = 0;
1657 if (do_peeling
1658 && one_misalignment_unknown)
1660 /* Check if the target requires to prefer stores over loads, i.e., if
1661 misaligned stores are more expensive than misaligned loads (taking
1662 drs with same alignment into account). */
1663 unsigned int load_inside_cost = 0;
1664 unsigned int load_outside_cost = 0;
1665 unsigned int store_inside_cost = 0;
1666 unsigned int store_outside_cost = 0;
1668 stmt_vector_for_cost dummy;
1669 dummy.create (2);
1670 vect_get_peeling_costs_all_drs (dr0,
1671 &load_inside_cost,
1672 &load_outside_cost,
1673 &dummy, vf / 2, true);
1674 dummy.release ();
1676 if (first_store)
1678 dummy.create (2);
1679 vect_get_peeling_costs_all_drs (first_store,
1680 &store_inside_cost,
1681 &store_outside_cost,
1682 &dummy, vf / 2, true);
1683 dummy.release ();
1685 else
1687 store_inside_cost = INT_MAX;
1688 store_outside_cost = INT_MAX;
1691 if (load_inside_cost > store_inside_cost
1692 || (load_inside_cost == store_inside_cost
1693 && load_outside_cost > store_outside_cost))
1695 dr0 = first_store;
1696 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1697 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1699 else
1701 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1702 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1705 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1706 prologue_cost_vec.create (2);
1707 epilogue_cost_vec.create (2);
1709 int dummy2;
1710 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1711 (loop_vinfo, vf / 2, &dummy2,
1712 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1713 &prologue_cost_vec, &epilogue_cost_vec);
1715 prologue_cost_vec.release ();
1716 epilogue_cost_vec.release ();
1718 peel_for_unknown_alignment.peel_info.count = 1
1719 + STMT_VINFO_SAME_ALIGN_REFS
1720 (vinfo_for_stmt (DR_STMT (dr0))).length ();
1723 peel_for_unknown_alignment.peel_info.npeel = 0;
1724 peel_for_unknown_alignment.peel_info.dr = dr0;
1726 best_peel = peel_for_unknown_alignment;
1728 peel_for_known_alignment.inside_cost = INT_MAX;
1729 peel_for_known_alignment.outside_cost = INT_MAX;
1730 peel_for_known_alignment.peel_info.count = 0;
1731 peel_for_known_alignment.peel_info.dr = NULL;
1733 if (do_peeling && one_misalignment_known)
1735 /* Peeling is possible, but there is no data access that is not supported
1736 unless aligned. So we try to choose the best possible peeling from
1737 the hash table. */
1738 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1739 (&peeling_htab, loop_vinfo, &npeel, &body_cost_vec);
1742 /* Compare costs of peeling for known and unknown alignment. */
1743 if (peel_for_known_alignment.peel_info.dr != NULL
1744 && peel_for_unknown_alignment.inside_cost
1745 >= peel_for_known_alignment.inside_cost)
1747 best_peel = peel_for_known_alignment;
1749 /* If the best peeling for known alignment has NPEEL == 0, perform no
1750 peeling at all except if there is an unsupportable dr that we can
1751 align. */
1752 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1753 do_peeling = false;
1756 /* If there is an unsupportable data ref, prefer this over all choices so far
1757 since we'd have to discard a chosen peeling except when it accidentally
1758 aligned the unsupportable data ref. */
1759 if (one_dr_unsupportable)
1760 dr0 = unsupportable_dr;
1761 else if (do_peeling)
1763 /* Calculate the penalty for no peeling, i.e. leaving everything
1764 unaligned.
1765 TODO: Adapt vect_get_peeling_costs_all_drs and use here. */
1766 unsigned nopeel_inside_cost = 0;
1767 unsigned nopeel_outside_cost = 0;
1769 stmt_vector_for_cost dummy;
1770 dummy.create (2);
1771 FOR_EACH_VEC_ELT (datarefs, i, dr)
1772 vect_get_data_access_cost (dr, &nopeel_inside_cost,
1773 &nopeel_outside_cost, &dummy);
1774 dummy.release ();
1776 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1777 costs will be recorded. */
1778 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1779 prologue_cost_vec.create (2);
1780 epilogue_cost_vec.create (2);
1782 int dummy2;
1783 nopeel_outside_cost += vect_get_known_peeling_cost
1784 (loop_vinfo, 0, &dummy2,
1785 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1786 &prologue_cost_vec, &epilogue_cost_vec);
1788 prologue_cost_vec.release ();
1789 epilogue_cost_vec.release ();
1791 npeel = best_peel.peel_info.npeel;
1792 dr0 = best_peel.peel_info.dr;
1794 /* If no peeling is not more expensive than the best peeling we
1795 have so far, don't perform any peeling. */
1796 if (nopeel_inside_cost <= best_peel.inside_cost)
1797 do_peeling = false;
1800 if (do_peeling)
1802 stmt = DR_STMT (dr0);
1803 stmt_info = vinfo_for_stmt (stmt);
1804 vectype = STMT_VINFO_VECTYPE (stmt_info);
1805 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1807 if (known_alignment_for_access_p (dr0))
1809 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1810 size_zero_node) < 0;
1811 if (!npeel)
1813 /* Since it's known at compile time, compute the number of
1814 iterations in the peeled loop (the peeling factor) for use in
1815 updating DR_MISALIGNMENT values. The peeling factor is the
1816 vectorization factor minus the misalignment as an element
1817 count. */
1818 mis = DR_MISALIGNMENT (dr0);
1819 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1820 npeel = ((negative ? mis - nelements : nelements - mis)
1821 & (nelements - 1));
1824 /* For interleaved data access every iteration accesses all the
1825 members of the group, therefore we divide the number of iterations
1826 by the group size. */
1827 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1828 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1829 npeel /= GROUP_SIZE (stmt_info);
1831 if (dump_enabled_p ())
1832 dump_printf_loc (MSG_NOTE, vect_location,
1833 "Try peeling by %d\n", npeel);
1836 /* Ensure that all datarefs can be vectorized after the peel. */
1837 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
1838 do_peeling = false;
1840 /* Check if all datarefs are supportable and log. */
1841 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1843 stat = vect_verify_datarefs_alignment (loop_vinfo);
1844 if (!stat)
1845 do_peeling = false;
1846 else
1848 body_cost_vec.release ();
1849 return stat;
1853 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1854 if (do_peeling)
1856 unsigned max_allowed_peel
1857 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1858 if (max_allowed_peel != (unsigned)-1)
1860 unsigned max_peel = npeel;
1861 if (max_peel == 0)
1863 gimple *dr_stmt = DR_STMT (dr0);
1864 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1865 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1866 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1868 if (max_peel > max_allowed_peel)
1870 do_peeling = false;
1871 if (dump_enabled_p ())
1872 dump_printf_loc (MSG_NOTE, vect_location,
1873 "Disable peeling, max peels reached: %d\n", max_peel);
1878 /* Cost model #2 - if peeling may result in a remaining loop not
1879 iterating enough to be vectorized then do not peel. */
1880 if (do_peeling
1881 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1883 unsigned max_peel
1884 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1885 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1886 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1887 do_peeling = false;
1890 if (do_peeling)
1892 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1893 If the misalignment of DR_i is identical to that of dr0 then set
1894 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1895 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1896 by the peeling factor times the element size of DR_i (MOD the
1897 vectorization factor times the size). Otherwise, the
1898 misalignment of DR_i must be set to unknown. */
1899 FOR_EACH_VEC_ELT (datarefs, i, dr)
1900 if (dr != dr0)
1902 /* Strided accesses perform only component accesses, alignment
1903 is irrelevant for them. */
1904 stmt_info = vinfo_for_stmt (DR_STMT (dr));
1905 if (STMT_VINFO_STRIDED_P (stmt_info)
1906 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1907 continue;
1909 vect_update_misalignment_for_peel (dr, dr0, npeel);
1912 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1913 if (npeel)
1914 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1915 else
1916 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1917 = DR_MISALIGNMENT (dr0);
1918 SET_DR_MISALIGNMENT (dr0, 0);
1919 if (dump_enabled_p ())
1921 dump_printf_loc (MSG_NOTE, vect_location,
1922 "Alignment of access forced using peeling.\n");
1923 dump_printf_loc (MSG_NOTE, vect_location,
1924 "Peeling for alignment will be applied.\n");
1926 /* The inside-loop cost will be accounted for in vectorizable_load
1927 and vectorizable_store correctly with adjusted alignments.
1928 Drop the body_cst_vec on the floor here. */
1929 body_cost_vec.release ();
1931 stat = vect_verify_datarefs_alignment (loop_vinfo);
1932 gcc_assert (stat);
1933 return stat;
1937 body_cost_vec.release ();
1939 /* (2) Versioning to force alignment. */
1941 /* Try versioning if:
1942 1) optimize loop for speed
1943 2) there is at least one unsupported misaligned data ref with an unknown
1944 misalignment, and
1945 3) all misaligned data refs with a known misalignment are supported, and
1946 4) the number of runtime alignment checks is within reason. */
1948 do_versioning =
1949 optimize_loop_nest_for_speed_p (loop)
1950 && (!loop->inner); /* FORNOW */
1952 if (do_versioning)
1954 FOR_EACH_VEC_ELT (datarefs, i, dr)
1956 stmt = DR_STMT (dr);
1957 stmt_info = vinfo_for_stmt (stmt);
1959 /* For interleaving, only the alignment of the first access
1960 matters. */
1961 if (aligned_access_p (dr)
1962 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1963 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1964 continue;
1966 if (STMT_VINFO_STRIDED_P (stmt_info))
1968 /* Strided loads perform only component accesses, alignment is
1969 irrelevant for them. */
1970 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1971 continue;
1972 do_versioning = false;
1973 break;
1976 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1978 if (!supportable_dr_alignment)
1980 gimple *stmt;
1981 int mask;
1982 tree vectype;
1984 if (known_alignment_for_access_p (dr)
1985 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1986 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1988 do_versioning = false;
1989 break;
1992 stmt = DR_STMT (dr);
1993 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1994 gcc_assert (vectype);
1996 /* The rightmost bits of an aligned address must be zeros.
1997 Construct the mask needed for this test. For example,
1998 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1999 mask must be 15 = 0xf. */
2000 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
2002 /* FORNOW: use the same mask to test all potentially unaligned
2003 references in the loop. The vectorizer currently supports
2004 a single vector size, see the reference to
2005 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2006 vectorization factor is computed. */
2007 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2008 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2009 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2010 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2011 DR_STMT (dr));
2015 /* Versioning requires at least one misaligned data reference. */
2016 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2017 do_versioning = false;
2018 else if (!do_versioning)
2019 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2022 if (do_versioning)
2024 vec<gimple *> may_misalign_stmts
2025 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2026 gimple *stmt;
2028 /* It can now be assumed that the data references in the statements
2029 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2030 of the loop being vectorized. */
2031 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2033 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2034 dr = STMT_VINFO_DATA_REF (stmt_info);
2035 SET_DR_MISALIGNMENT (dr, 0);
2036 if (dump_enabled_p ())
2037 dump_printf_loc (MSG_NOTE, vect_location,
2038 "Alignment of access forced using versioning.\n");
2041 if (dump_enabled_p ())
2042 dump_printf_loc (MSG_NOTE, vect_location,
2043 "Versioning for alignment will be applied.\n");
2045 /* Peeling and versioning can't be done together at this time. */
2046 gcc_assert (! (do_peeling && do_versioning));
2048 stat = vect_verify_datarefs_alignment (loop_vinfo);
2049 gcc_assert (stat);
2050 return stat;
2053 /* This point is reached if neither peeling nor versioning is being done. */
2054 gcc_assert (! (do_peeling || do_versioning));
2056 stat = vect_verify_datarefs_alignment (loop_vinfo);
2057 return stat;
2061 /* Function vect_find_same_alignment_drs.
2063 Update group and alignment relations according to the chosen
2064 vectorization factor. */
2066 static void
2067 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2069 struct data_reference *dra = DDR_A (ddr);
2070 struct data_reference *drb = DDR_B (ddr);
2071 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2072 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2074 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2075 return;
2077 if (dra == drb)
2078 return;
2080 if (!operand_equal_p (DR_BASE_OBJECT (dra), DR_BASE_OBJECT (drb),
2081 OEP_ADDRESS_OF)
2082 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2083 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2084 return;
2086 /* Two references with distance zero have the same alignment. */
2087 offset_int diff = (wi::to_offset (DR_INIT (dra))
2088 - wi::to_offset (DR_INIT (drb)));
2089 if (diff != 0)
2091 /* Get the wider of the two alignments. */
2092 unsigned int align_a = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmtinfo_a));
2093 unsigned int align_b = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmtinfo_b));
2094 unsigned int max_align = MAX (align_a, align_b);
2096 /* Require the gap to be a multiple of the larger vector alignment. */
2097 if (!wi::multiple_of_p (diff, max_align, SIGNED))
2098 return;
2101 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2102 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2103 if (dump_enabled_p ())
2105 dump_printf_loc (MSG_NOTE, vect_location,
2106 "accesses have the same alignment: ");
2107 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2108 dump_printf (MSG_NOTE, " and ");
2109 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2110 dump_printf (MSG_NOTE, "\n");
2115 /* Function vect_analyze_data_refs_alignment
2117 Analyze the alignment of the data-references in the loop.
2118 Return FALSE if a data reference is found that cannot be vectorized. */
2120 bool
2121 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2123 if (dump_enabled_p ())
2124 dump_printf_loc (MSG_NOTE, vect_location,
2125 "=== vect_analyze_data_refs_alignment ===\n");
2127 /* Mark groups of data references with same alignment using
2128 data dependence information. */
2129 vec<ddr_p> ddrs = vinfo->ddrs;
2130 struct data_dependence_relation *ddr;
2131 unsigned int i;
2133 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2134 vect_find_same_alignment_drs (ddr);
2136 vec<data_reference_p> datarefs = vinfo->datarefs;
2137 struct data_reference *dr;
2139 FOR_EACH_VEC_ELT (datarefs, i, dr)
2141 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2142 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2143 && !vect_compute_data_ref_alignment (dr))
2145 /* Strided accesses perform only component accesses, misalignment
2146 information is irrelevant for them. */
2147 if (STMT_VINFO_STRIDED_P (stmt_info)
2148 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2149 continue;
2151 if (dump_enabled_p ())
2152 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2153 "not vectorized: can't calculate alignment "
2154 "for data ref.\n");
2156 return false;
2160 return true;
2164 /* Analyze alignment of DRs of stmts in NODE. */
2166 static bool
2167 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2169 /* We vectorize from the first scalar stmt in the node unless
2170 the node is permuted in which case we start from the first
2171 element in the group. */
2172 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2173 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2174 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2175 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2177 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2178 if (! vect_compute_data_ref_alignment (dr)
2179 /* For creating the data-ref pointer we need alignment of the
2180 first element anyway. */
2181 || (dr != first_dr
2182 && ! vect_compute_data_ref_alignment (first_dr))
2183 || ! verify_data_ref_alignment (dr))
2185 if (dump_enabled_p ())
2186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2187 "not vectorized: bad data alignment in basic "
2188 "block.\n");
2189 return false;
2192 return true;
2195 /* Function vect_slp_analyze_instance_alignment
2197 Analyze the alignment of the data-references in the SLP instance.
2198 Return FALSE if a data reference is found that cannot be vectorized. */
2200 bool
2201 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2203 if (dump_enabled_p ())
2204 dump_printf_loc (MSG_NOTE, vect_location,
2205 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2207 slp_tree node;
2208 unsigned i;
2209 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2210 if (! vect_slp_analyze_and_verify_node_alignment (node))
2211 return false;
2213 node = SLP_INSTANCE_TREE (instance);
2214 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2215 && ! vect_slp_analyze_and_verify_node_alignment
2216 (SLP_INSTANCE_TREE (instance)))
2217 return false;
2219 return true;
2223 /* Analyze groups of accesses: check that DR belongs to a group of
2224 accesses of legal size, step, etc. Detect gaps, single element
2225 interleaving, and other special cases. Set grouped access info.
2226 Collect groups of strided stores for further use in SLP analysis.
2227 Worker for vect_analyze_group_access. */
2229 static bool
2230 vect_analyze_group_access_1 (struct data_reference *dr)
2232 tree step = DR_STEP (dr);
2233 tree scalar_type = TREE_TYPE (DR_REF (dr));
2234 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2235 gimple *stmt = DR_STMT (dr);
2236 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2237 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2238 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2239 HOST_WIDE_INT dr_step = -1;
2240 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2241 bool slp_impossible = false;
2243 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2244 size of the interleaving group (including gaps). */
2245 if (tree_fits_shwi_p (step))
2247 dr_step = tree_to_shwi (step);
2248 /* Check that STEP is a multiple of type size. Otherwise there is
2249 a non-element-sized gap at the end of the group which we
2250 cannot represent in GROUP_GAP or GROUP_SIZE.
2251 ??? As we can handle non-constant step fine here we should
2252 simply remove uses of GROUP_GAP between the last and first
2253 element and instead rely on DR_STEP. GROUP_SIZE then would
2254 simply not include that gap. */
2255 if ((dr_step % type_size) != 0)
2257 if (dump_enabled_p ())
2259 dump_printf_loc (MSG_NOTE, vect_location,
2260 "Step ");
2261 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2262 dump_printf (MSG_NOTE,
2263 " is not a multiple of the element size for ");
2264 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2265 dump_printf (MSG_NOTE, "\n");
2267 return false;
2269 groupsize = absu_hwi (dr_step) / type_size;
2271 else
2272 groupsize = 0;
2274 /* Not consecutive access is possible only if it is a part of interleaving. */
2275 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2277 /* Check if it this DR is a part of interleaving, and is a single
2278 element of the group that is accessed in the loop. */
2280 /* Gaps are supported only for loads. STEP must be a multiple of the type
2281 size. The size of the group must be a power of 2. */
2282 if (DR_IS_READ (dr)
2283 && (dr_step % type_size) == 0
2284 && groupsize > 0
2285 && pow2p_hwi (groupsize))
2287 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2288 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2289 GROUP_GAP (stmt_info) = groupsize - 1;
2290 if (dump_enabled_p ())
2292 dump_printf_loc (MSG_NOTE, vect_location,
2293 "Detected single element interleaving ");
2294 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2295 dump_printf (MSG_NOTE, " step ");
2296 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2297 dump_printf (MSG_NOTE, "\n");
2300 return true;
2303 if (dump_enabled_p ())
2305 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2306 "not consecutive access ");
2307 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2310 if (bb_vinfo)
2312 /* Mark the statement as unvectorizable. */
2313 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2314 return true;
2317 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2318 STMT_VINFO_STRIDED_P (stmt_info) = true;
2319 return true;
2322 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2324 /* First stmt in the interleaving chain. Check the chain. */
2325 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2326 struct data_reference *data_ref = dr;
2327 unsigned int count = 1;
2328 tree prev_init = DR_INIT (data_ref);
2329 gimple *prev = stmt;
2330 HOST_WIDE_INT diff, gaps = 0;
2332 while (next)
2334 /* Skip same data-refs. In case that two or more stmts share
2335 data-ref (supported only for loads), we vectorize only the first
2336 stmt, and the rest get their vectorized loads from the first
2337 one. */
2338 if (!tree_int_cst_compare (DR_INIT (data_ref),
2339 DR_INIT (STMT_VINFO_DATA_REF (
2340 vinfo_for_stmt (next)))))
2342 if (DR_IS_WRITE (data_ref))
2344 if (dump_enabled_p ())
2345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2346 "Two store stmts share the same dr.\n");
2347 return false;
2350 if (dump_enabled_p ())
2351 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2352 "Two or more load stmts share the same dr.\n");
2354 /* For load use the same data-ref load. */
2355 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2357 prev = next;
2358 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2359 continue;
2362 prev = next;
2363 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2365 /* All group members have the same STEP by construction. */
2366 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2368 /* Check that the distance between two accesses is equal to the type
2369 size. Otherwise, we have gaps. */
2370 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2371 - TREE_INT_CST_LOW (prev_init)) / type_size;
2372 if (diff != 1)
2374 /* FORNOW: SLP of accesses with gaps is not supported. */
2375 slp_impossible = true;
2376 if (DR_IS_WRITE (data_ref))
2378 if (dump_enabled_p ())
2379 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2380 "interleaved store with gaps\n");
2381 return false;
2384 gaps += diff - 1;
2387 last_accessed_element += diff;
2389 /* Store the gap from the previous member of the group. If there is no
2390 gap in the access, GROUP_GAP is always 1. */
2391 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2393 prev_init = DR_INIT (data_ref);
2394 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2395 /* Count the number of data-refs in the chain. */
2396 count++;
2399 if (groupsize == 0)
2400 groupsize = count + gaps;
2402 /* This could be UINT_MAX but as we are generating code in a very
2403 inefficient way we have to cap earlier. See PR78699 for example. */
2404 if (groupsize > 4096)
2406 if (dump_enabled_p ())
2407 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2408 "group is too large\n");
2409 return false;
2412 /* Check that the size of the interleaving is equal to count for stores,
2413 i.e., that there are no gaps. */
2414 if (groupsize != count
2415 && !DR_IS_READ (dr))
2417 if (dump_enabled_p ())
2418 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2419 "interleaved store with gaps\n");
2420 return false;
2423 /* If there is a gap after the last load in the group it is the
2424 difference between the groupsize and the last accessed
2425 element.
2426 When there is no gap, this difference should be 0. */
2427 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2429 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2430 if (dump_enabled_p ())
2432 dump_printf_loc (MSG_NOTE, vect_location,
2433 "Detected interleaving ");
2434 if (DR_IS_READ (dr))
2435 dump_printf (MSG_NOTE, "load ");
2436 else
2437 dump_printf (MSG_NOTE, "store ");
2438 dump_printf (MSG_NOTE, "of size %u starting with ",
2439 (unsigned)groupsize);
2440 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2441 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2442 dump_printf_loc (MSG_NOTE, vect_location,
2443 "There is a gap of %u elements after the group\n",
2444 GROUP_GAP (vinfo_for_stmt (stmt)));
2447 /* SLP: create an SLP data structure for every interleaving group of
2448 stores for further analysis in vect_analyse_slp. */
2449 if (DR_IS_WRITE (dr) && !slp_impossible)
2451 if (loop_vinfo)
2452 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2453 if (bb_vinfo)
2454 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2458 return true;
2461 /* Analyze groups of accesses: check that DR belongs to a group of
2462 accesses of legal size, step, etc. Detect gaps, single element
2463 interleaving, and other special cases. Set grouped access info.
2464 Collect groups of strided stores for further use in SLP analysis. */
2466 static bool
2467 vect_analyze_group_access (struct data_reference *dr)
2469 if (!vect_analyze_group_access_1 (dr))
2471 /* Dissolve the group if present. */
2472 gimple *next;
2473 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2474 while (stmt)
2476 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2477 next = GROUP_NEXT_ELEMENT (vinfo);
2478 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2479 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2480 stmt = next;
2482 return false;
2484 return true;
2487 /* Analyze the access pattern of the data-reference DR.
2488 In case of non-consecutive accesses call vect_analyze_group_access() to
2489 analyze groups of accesses. */
2491 static bool
2492 vect_analyze_data_ref_access (struct data_reference *dr)
2494 tree step = DR_STEP (dr);
2495 tree scalar_type = TREE_TYPE (DR_REF (dr));
2496 gimple *stmt = DR_STMT (dr);
2497 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2498 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2499 struct loop *loop = NULL;
2501 if (loop_vinfo)
2502 loop = LOOP_VINFO_LOOP (loop_vinfo);
2504 if (loop_vinfo && !step)
2506 if (dump_enabled_p ())
2507 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2508 "bad data-ref access in loop\n");
2509 return false;
2512 /* Allow loads with zero step in inner-loop vectorization. */
2513 if (loop_vinfo && integer_zerop (step))
2515 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2516 if (!nested_in_vect_loop_p (loop, stmt))
2517 return DR_IS_READ (dr);
2518 /* Allow references with zero step for outer loops marked
2519 with pragma omp simd only - it guarantees absence of
2520 loop-carried dependencies between inner loop iterations. */
2521 if (!loop->force_vectorize)
2523 if (dump_enabled_p ())
2524 dump_printf_loc (MSG_NOTE, vect_location,
2525 "zero step in inner loop of nest\n");
2526 return false;
2530 if (loop && nested_in_vect_loop_p (loop, stmt))
2532 /* Interleaved accesses are not yet supported within outer-loop
2533 vectorization for references in the inner-loop. */
2534 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2536 /* For the rest of the analysis we use the outer-loop step. */
2537 step = STMT_VINFO_DR_STEP (stmt_info);
2538 if (integer_zerop (step))
2540 if (dump_enabled_p ())
2541 dump_printf_loc (MSG_NOTE, vect_location,
2542 "zero step in outer loop.\n");
2543 return DR_IS_READ (dr);
2547 /* Consecutive? */
2548 if (TREE_CODE (step) == INTEGER_CST)
2550 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2551 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2552 || (dr_step < 0
2553 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2555 /* Mark that it is not interleaving. */
2556 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2557 return true;
2561 if (loop && nested_in_vect_loop_p (loop, stmt))
2563 if (dump_enabled_p ())
2564 dump_printf_loc (MSG_NOTE, vect_location,
2565 "grouped access in outer loop.\n");
2566 return false;
2570 /* Assume this is a DR handled by non-constant strided load case. */
2571 if (TREE_CODE (step) != INTEGER_CST)
2572 return (STMT_VINFO_STRIDED_P (stmt_info)
2573 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2574 || vect_analyze_group_access (dr)));
2576 /* Not consecutive access - check if it's a part of interleaving group. */
2577 return vect_analyze_group_access (dr);
2580 /* Compare two data-references DRA and DRB to group them into chunks
2581 suitable for grouping. */
2583 static int
2584 dr_group_sort_cmp (const void *dra_, const void *drb_)
2586 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2587 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2588 int cmp;
2590 /* Stabilize sort. */
2591 if (dra == drb)
2592 return 0;
2594 /* DRs in different loops never belong to the same group. */
2595 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2596 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2597 if (loopa != loopb)
2598 return loopa->num < loopb->num ? -1 : 1;
2600 /* Ordering of DRs according to base. */
2601 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2603 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2604 DR_BASE_ADDRESS (drb));
2605 if (cmp != 0)
2606 return cmp;
2609 /* And according to DR_OFFSET. */
2610 if (!dr_equal_offsets_p (dra, drb))
2612 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2613 if (cmp != 0)
2614 return cmp;
2617 /* Put reads before writes. */
2618 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2619 return DR_IS_READ (dra) ? -1 : 1;
2621 /* Then sort after access size. */
2622 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2623 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2625 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2626 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2627 if (cmp != 0)
2628 return cmp;
2631 /* And after step. */
2632 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2634 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2635 if (cmp != 0)
2636 return cmp;
2639 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2640 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2641 if (cmp == 0)
2642 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2643 return cmp;
2646 /* Function vect_analyze_data_ref_accesses.
2648 Analyze the access pattern of all the data references in the loop.
2650 FORNOW: the only access pattern that is considered vectorizable is a
2651 simple step 1 (consecutive) access.
2653 FORNOW: handle only arrays and pointer accesses. */
2655 bool
2656 vect_analyze_data_ref_accesses (vec_info *vinfo)
2658 unsigned int i;
2659 vec<data_reference_p> datarefs = vinfo->datarefs;
2660 struct data_reference *dr;
2662 if (dump_enabled_p ())
2663 dump_printf_loc (MSG_NOTE, vect_location,
2664 "=== vect_analyze_data_ref_accesses ===\n");
2666 if (datarefs.is_empty ())
2667 return true;
2669 /* Sort the array of datarefs to make building the interleaving chains
2670 linear. Don't modify the original vector's order, it is needed for
2671 determining what dependencies are reversed. */
2672 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2673 datarefs_copy.qsort (dr_group_sort_cmp);
2675 /* Build the interleaving chains. */
2676 for (i = 0; i < datarefs_copy.length () - 1;)
2678 data_reference_p dra = datarefs_copy[i];
2679 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2680 stmt_vec_info lastinfo = NULL;
2681 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2683 ++i;
2684 continue;
2686 for (i = i + 1; i < datarefs_copy.length (); ++i)
2688 data_reference_p drb = datarefs_copy[i];
2689 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2690 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2691 break;
2693 /* ??? Imperfect sorting (non-compatible types, non-modulo
2694 accesses, same accesses) can lead to a group to be artificially
2695 split here as we don't just skip over those. If it really
2696 matters we can push those to a worklist and re-iterate
2697 over them. The we can just skip ahead to the next DR here. */
2699 /* DRs in a different loop should not be put into the same
2700 interleaving group. */
2701 if (gimple_bb (DR_STMT (dra))->loop_father
2702 != gimple_bb (DR_STMT (drb))->loop_father)
2703 break;
2705 /* Check that the data-refs have same first location (except init)
2706 and they are both either store or load (not load and store,
2707 not masked loads or stores). */
2708 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2709 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2710 DR_BASE_ADDRESS (drb), 0)
2711 || !dr_equal_offsets_p (dra, drb)
2712 || !gimple_assign_single_p (DR_STMT (dra))
2713 || !gimple_assign_single_p (DR_STMT (drb)))
2714 break;
2716 /* Check that the data-refs have the same constant size. */
2717 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2718 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2719 if (!tree_fits_uhwi_p (sza)
2720 || !tree_fits_uhwi_p (szb)
2721 || !tree_int_cst_equal (sza, szb))
2722 break;
2724 /* Check that the data-refs have the same step. */
2725 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2726 break;
2728 /* Do not place the same access in the interleaving chain twice. */
2729 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2730 break;
2732 /* Check the types are compatible.
2733 ??? We don't distinguish this during sorting. */
2734 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2735 TREE_TYPE (DR_REF (drb))))
2736 break;
2738 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2739 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2740 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2741 gcc_assert (init_a <= init_b);
2743 /* If init_b == init_a + the size of the type * k, we have an
2744 interleaving, and DRA is accessed before DRB. */
2745 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2746 if (type_size_a == 0
2747 || (init_b - init_a) % type_size_a != 0)
2748 break;
2750 /* If we have a store, the accesses are adjacent. This splits
2751 groups into chunks we support (we don't support vectorization
2752 of stores with gaps). */
2753 if (!DR_IS_READ (dra)
2754 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2755 (DR_INIT (datarefs_copy[i-1]))
2756 != type_size_a))
2757 break;
2759 /* If the step (if not zero or non-constant) is greater than the
2760 difference between data-refs' inits this splits groups into
2761 suitable sizes. */
2762 if (tree_fits_shwi_p (DR_STEP (dra)))
2764 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2765 if (step != 0 && step <= (init_b - init_a))
2766 break;
2769 if (dump_enabled_p ())
2771 dump_printf_loc (MSG_NOTE, vect_location,
2772 "Detected interleaving ");
2773 if (DR_IS_READ (dra))
2774 dump_printf (MSG_NOTE, "load ");
2775 else
2776 dump_printf (MSG_NOTE, "store ");
2777 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2778 dump_printf (MSG_NOTE, " and ");
2779 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2780 dump_printf (MSG_NOTE, "\n");
2783 /* Link the found element into the group list. */
2784 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2786 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2787 lastinfo = stmtinfo_a;
2789 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2790 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2791 lastinfo = stmtinfo_b;
2795 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2796 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2797 && !vect_analyze_data_ref_access (dr))
2799 if (dump_enabled_p ())
2800 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2801 "not vectorized: complicated access pattern.\n");
2803 if (is_a <bb_vec_info> (vinfo))
2805 /* Mark the statement as not vectorizable. */
2806 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2807 continue;
2809 else
2811 datarefs_copy.release ();
2812 return false;
2816 datarefs_copy.release ();
2817 return true;
2820 /* Function vect_vfa_segment_size.
2822 Create an expression that computes the size of segment
2823 that will be accessed for a data reference. The functions takes into
2824 account that realignment loads may access one more vector.
2826 Input:
2827 DR: The data reference.
2828 LENGTH_FACTOR: segment length to consider.
2830 Return an expression whose value is the size of segment which will be
2831 accessed by DR. */
2833 static tree
2834 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2836 tree segment_length;
2838 if (integer_zerop (DR_STEP (dr)))
2839 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2840 else
2841 segment_length = size_binop (MULT_EXPR,
2842 fold_convert (sizetype, DR_STEP (dr)),
2843 fold_convert (sizetype, length_factor));
2845 if (vect_supportable_dr_alignment (dr, false)
2846 == dr_explicit_realign_optimized)
2848 tree vector_size = TYPE_SIZE_UNIT
2849 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2851 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2853 return segment_length;
2856 /* Function vect_no_alias_p.
2858 Given data references A and B with equal base and offset, the alias
2859 relation can be decided at compilation time, return TRUE if they do
2860 not alias to each other; return FALSE otherwise. SEGMENT_LENGTH_A
2861 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
2862 respectively. */
2864 static bool
2865 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
2866 tree segment_length_a, tree segment_length_b)
2868 gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
2869 && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
2870 if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
2871 return false;
2873 tree seg_a_min = DR_INIT (a);
2874 tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
2875 seg_a_min, segment_length_a);
2876 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
2877 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
2878 [a, a+12) */
2879 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
2881 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
2882 seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
2883 seg_a_max, unit_size);
2884 seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
2885 DR_INIT (a), unit_size);
2887 tree seg_b_min = DR_INIT (b);
2888 tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
2889 seg_b_min, segment_length_b);
2890 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
2892 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
2893 seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
2894 seg_b_max, unit_size);
2895 seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
2896 DR_INIT (b), unit_size);
2899 if (tree_int_cst_le (seg_a_max, seg_b_min)
2900 || tree_int_cst_le (seg_b_max, seg_a_min))
2901 return true;
2903 return false;
2906 /* Function vect_prune_runtime_alias_test_list.
2908 Prune a list of ddrs to be tested at run-time by versioning for alias.
2909 Merge several alias checks into one if possible.
2910 Return FALSE if resulting list of ddrs is longer then allowed by
2911 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2913 bool
2914 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2916 vec<ddr_p> may_alias_ddrs =
2917 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2918 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2919 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2920 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2921 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2923 ddr_p ddr;
2924 unsigned int i;
2925 tree length_factor;
2927 if (dump_enabled_p ())
2928 dump_printf_loc (MSG_NOTE, vect_location,
2929 "=== vect_prune_runtime_alias_test_list ===\n");
2931 if (may_alias_ddrs.is_empty ())
2932 return true;
2934 comp_alias_ddrs.create (may_alias_ddrs.length ());
2936 /* First, we collect all data ref pairs for aliasing checks. */
2937 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2939 int comp_res;
2940 struct data_reference *dr_a, *dr_b;
2941 gimple *dr_group_first_a, *dr_group_first_b;
2942 tree segment_length_a, segment_length_b;
2943 gimple *stmt_a, *stmt_b;
2945 dr_a = DDR_A (ddr);
2946 stmt_a = DR_STMT (DDR_A (ddr));
2947 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2948 if (dr_group_first_a)
2950 stmt_a = dr_group_first_a;
2951 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2954 dr_b = DDR_B (ddr);
2955 stmt_b = DR_STMT (DDR_B (ddr));
2956 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2957 if (dr_group_first_b)
2959 stmt_b = dr_group_first_b;
2960 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2963 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2964 length_factor = scalar_loop_iters;
2965 else
2966 length_factor = size_int (vect_factor);
2967 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2968 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2970 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
2971 DR_BASE_ADDRESS (dr_b));
2972 if (comp_res == 0)
2973 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
2974 DR_OFFSET (dr_b));
2976 /* Alias is known at compilation time. */
2977 if (comp_res == 0
2978 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
2979 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
2980 && TREE_CODE (segment_length_a) == INTEGER_CST
2981 && TREE_CODE (segment_length_b) == INTEGER_CST)
2983 if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b))
2984 continue;
2986 if (dump_enabled_p ())
2987 dump_printf_loc (MSG_NOTE, vect_location,
2988 "not vectorized: compilation time alias.\n");
2990 return false;
2993 dr_with_seg_len_pair_t dr_with_seg_len_pair
2994 (dr_with_seg_len (dr_a, segment_length_a),
2995 dr_with_seg_len (dr_b, segment_length_b));
2997 /* Canonicalize pairs by sorting the two DR members. */
2998 if (comp_res > 0)
2999 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3001 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3004 prune_runtime_alias_test_list (&comp_alias_ddrs,
3005 (unsigned HOST_WIDE_INT) vect_factor);
3006 dump_printf_loc (MSG_NOTE, vect_location,
3007 "improved number of alias checks from %d to %d\n",
3008 may_alias_ddrs.length (), comp_alias_ddrs.length ());
3009 if ((int) comp_alias_ddrs.length () >
3010 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3012 if (dump_enabled_p ())
3013 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3014 "number of versioning for alias "
3015 "run-time tests exceeds %d "
3016 "(--param vect-max-version-for-alias-checks)\n",
3017 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3018 return false;
3021 /* All alias checks have been resolved at compilation time. */
3022 if (!comp_alias_ddrs.length ())
3023 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0);
3025 return true;
3028 /* Return true if a non-affine read or write in STMT is suitable for a
3029 gather load or scatter store. Describe the operation in *INFO if so. */
3031 bool
3032 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3033 gather_scatter_info *info)
3035 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3036 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3037 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3038 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3039 tree offtype = NULL_TREE;
3040 tree decl, base, off;
3041 machine_mode pmode;
3042 int punsignedp, reversep, pvolatilep = 0;
3044 base = DR_REF (dr);
3045 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3046 see if we can use the def stmt of the address. */
3047 if (is_gimple_call (stmt)
3048 && gimple_call_internal_p (stmt)
3049 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3050 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3051 && TREE_CODE (base) == MEM_REF
3052 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3053 && integer_zerop (TREE_OPERAND (base, 1))
3054 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3056 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3057 if (is_gimple_assign (def_stmt)
3058 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3059 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3062 /* The gather and scatter builtins need address of the form
3063 loop_invariant + vector * {1, 2, 4, 8}
3065 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3066 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3067 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3068 multiplications and additions in it. To get a vector, we need
3069 a single SSA_NAME that will be defined in the loop and will
3070 contain everything that is not loop invariant and that can be
3071 vectorized. The following code attempts to find such a preexistng
3072 SSA_NAME OFF and put the loop invariants into a tree BASE
3073 that can be gimplified before the loop. */
3074 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3075 &punsignedp, &reversep, &pvolatilep);
3076 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3078 if (TREE_CODE (base) == MEM_REF)
3080 if (!integer_zerop (TREE_OPERAND (base, 1)))
3082 if (off == NULL_TREE)
3084 offset_int moff = mem_ref_offset (base);
3085 off = wide_int_to_tree (sizetype, moff);
3087 else
3088 off = size_binop (PLUS_EXPR, off,
3089 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3091 base = TREE_OPERAND (base, 0);
3093 else
3094 base = build_fold_addr_expr (base);
3096 if (off == NULL_TREE)
3097 off = size_zero_node;
3099 /* If base is not loop invariant, either off is 0, then we start with just
3100 the constant offset in the loop invariant BASE and continue with base
3101 as OFF, otherwise give up.
3102 We could handle that case by gimplifying the addition of base + off
3103 into some SSA_NAME and use that as off, but for now punt. */
3104 if (!expr_invariant_in_loop_p (loop, base))
3106 if (!integer_zerop (off))
3107 return false;
3108 off = base;
3109 base = size_int (pbitpos / BITS_PER_UNIT);
3111 /* Otherwise put base + constant offset into the loop invariant BASE
3112 and continue with OFF. */
3113 else
3115 base = fold_convert (sizetype, base);
3116 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3119 /* OFF at this point may be either a SSA_NAME or some tree expression
3120 from get_inner_reference. Try to peel off loop invariants from it
3121 into BASE as long as possible. */
3122 STRIP_NOPS (off);
3123 while (offtype == NULL_TREE)
3125 enum tree_code code;
3126 tree op0, op1, add = NULL_TREE;
3128 if (TREE_CODE (off) == SSA_NAME)
3130 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3132 if (expr_invariant_in_loop_p (loop, off))
3133 return false;
3135 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3136 break;
3138 op0 = gimple_assign_rhs1 (def_stmt);
3139 code = gimple_assign_rhs_code (def_stmt);
3140 op1 = gimple_assign_rhs2 (def_stmt);
3142 else
3144 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3145 return false;
3146 code = TREE_CODE (off);
3147 extract_ops_from_tree (off, &code, &op0, &op1);
3149 switch (code)
3151 case POINTER_PLUS_EXPR:
3152 case PLUS_EXPR:
3153 if (expr_invariant_in_loop_p (loop, op0))
3155 add = op0;
3156 off = op1;
3157 do_add:
3158 add = fold_convert (sizetype, add);
3159 if (scale != 1)
3160 add = size_binop (MULT_EXPR, add, size_int (scale));
3161 base = size_binop (PLUS_EXPR, base, add);
3162 continue;
3164 if (expr_invariant_in_loop_p (loop, op1))
3166 add = op1;
3167 off = op0;
3168 goto do_add;
3170 break;
3171 case MINUS_EXPR:
3172 if (expr_invariant_in_loop_p (loop, op1))
3174 add = fold_convert (sizetype, op1);
3175 add = size_binop (MINUS_EXPR, size_zero_node, add);
3176 off = op0;
3177 goto do_add;
3179 break;
3180 case MULT_EXPR:
3181 if (scale == 1 && tree_fits_shwi_p (op1))
3183 scale = tree_to_shwi (op1);
3184 off = op0;
3185 continue;
3187 break;
3188 case SSA_NAME:
3189 off = op0;
3190 continue;
3191 CASE_CONVERT:
3192 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3193 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3194 break;
3195 if (TYPE_PRECISION (TREE_TYPE (op0))
3196 == TYPE_PRECISION (TREE_TYPE (off)))
3198 off = op0;
3199 continue;
3201 if (TYPE_PRECISION (TREE_TYPE (op0))
3202 < TYPE_PRECISION (TREE_TYPE (off)))
3204 off = op0;
3205 offtype = TREE_TYPE (off);
3206 STRIP_NOPS (off);
3207 continue;
3209 break;
3210 default:
3211 break;
3213 break;
3216 /* If at the end OFF still isn't a SSA_NAME or isn't
3217 defined in the loop, punt. */
3218 if (TREE_CODE (off) != SSA_NAME
3219 || expr_invariant_in_loop_p (loop, off))
3220 return false;
3222 if (offtype == NULL_TREE)
3223 offtype = TREE_TYPE (off);
3225 if (DR_IS_READ (dr))
3226 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3227 offtype, scale);
3228 else
3229 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3230 offtype, scale);
3232 if (decl == NULL_TREE)
3233 return false;
3235 info->decl = decl;
3236 info->base = base;
3237 info->offset = off;
3238 info->offset_dt = vect_unknown_def_type;
3239 info->offset_vectype = NULL_TREE;
3240 info->scale = scale;
3241 return true;
3244 /* Function vect_analyze_data_refs.
3246 Find all the data references in the loop or basic block.
3248 The general structure of the analysis of data refs in the vectorizer is as
3249 follows:
3250 1- vect_analyze_data_refs(loop/bb): call
3251 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3252 in the loop/bb and their dependences.
3253 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3254 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3255 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3259 bool
3260 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3262 struct loop *loop = NULL;
3263 unsigned int i;
3264 struct data_reference *dr;
3265 tree scalar_type;
3267 if (dump_enabled_p ())
3268 dump_printf_loc (MSG_NOTE, vect_location,
3269 "=== vect_analyze_data_refs ===\n");
3271 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3272 loop = LOOP_VINFO_LOOP (loop_vinfo);
3274 /* Go through the data-refs, check that the analysis succeeded. Update
3275 pointer from stmt_vec_info struct to DR and vectype. */
3277 vec<data_reference_p> datarefs = vinfo->datarefs;
3278 FOR_EACH_VEC_ELT (datarefs, i, dr)
3280 gimple *stmt;
3281 stmt_vec_info stmt_info;
3282 tree base, offset, init;
3283 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3284 bool simd_lane_access = false;
3285 int vf;
3287 again:
3288 if (!dr || !DR_REF (dr))
3290 if (dump_enabled_p ())
3291 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3292 "not vectorized: unhandled data-ref\n");
3293 return false;
3296 stmt = DR_STMT (dr);
3297 stmt_info = vinfo_for_stmt (stmt);
3299 /* Discard clobbers from the dataref vector. We will remove
3300 clobber stmts during vectorization. */
3301 if (gimple_clobber_p (stmt))
3303 free_data_ref (dr);
3304 if (i == datarefs.length () - 1)
3306 datarefs.pop ();
3307 break;
3309 datarefs.ordered_remove (i);
3310 dr = datarefs[i];
3311 goto again;
3314 /* Check that analysis of the data-ref succeeded. */
3315 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3316 || !DR_STEP (dr))
3318 bool maybe_gather
3319 = DR_IS_READ (dr)
3320 && !TREE_THIS_VOLATILE (DR_REF (dr))
3321 && targetm.vectorize.builtin_gather != NULL;
3322 bool maybe_scatter
3323 = DR_IS_WRITE (dr)
3324 && !TREE_THIS_VOLATILE (DR_REF (dr))
3325 && targetm.vectorize.builtin_scatter != NULL;
3326 bool maybe_simd_lane_access
3327 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3329 /* If target supports vector gather loads or scatter stores, or if
3330 this might be a SIMD lane access, see if they can't be used. */
3331 if (is_a <loop_vec_info> (vinfo)
3332 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3333 && !nested_in_vect_loop_p (loop, stmt))
3335 struct data_reference *newdr
3336 = create_data_ref (NULL, loop_containing_stmt (stmt),
3337 DR_REF (dr), stmt, maybe_scatter ? false : true);
3338 gcc_assert (newdr != NULL && DR_REF (newdr));
3339 if (DR_BASE_ADDRESS (newdr)
3340 && DR_OFFSET (newdr)
3341 && DR_INIT (newdr)
3342 && DR_STEP (newdr)
3343 && integer_zerop (DR_STEP (newdr)))
3345 if (maybe_simd_lane_access)
3347 tree off = DR_OFFSET (newdr);
3348 STRIP_NOPS (off);
3349 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3350 && TREE_CODE (off) == MULT_EXPR
3351 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3353 tree step = TREE_OPERAND (off, 1);
3354 off = TREE_OPERAND (off, 0);
3355 STRIP_NOPS (off);
3356 if (CONVERT_EXPR_P (off)
3357 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3358 0)))
3359 < TYPE_PRECISION (TREE_TYPE (off)))
3360 off = TREE_OPERAND (off, 0);
3361 if (TREE_CODE (off) == SSA_NAME)
3363 gimple *def = SSA_NAME_DEF_STMT (off);
3364 tree reft = TREE_TYPE (DR_REF (newdr));
3365 if (is_gimple_call (def)
3366 && gimple_call_internal_p (def)
3367 && (gimple_call_internal_fn (def)
3368 == IFN_GOMP_SIMD_LANE))
3370 tree arg = gimple_call_arg (def, 0);
3371 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3372 arg = SSA_NAME_VAR (arg);
3373 if (arg == loop->simduid
3374 /* For now. */
3375 && tree_int_cst_equal
3376 (TYPE_SIZE_UNIT (reft),
3377 step))
3379 DR_OFFSET (newdr) = ssize_int (0);
3380 DR_STEP (newdr) = step;
3381 DR_OFFSET_ALIGNMENT (newdr)
3382 = BIGGEST_ALIGNMENT;
3383 DR_STEP_ALIGNMENT (newdr)
3384 = highest_pow2_factor (step);
3385 dr = newdr;
3386 simd_lane_access = true;
3392 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3394 dr = newdr;
3395 if (maybe_gather)
3396 gatherscatter = GATHER;
3397 else
3398 gatherscatter = SCATTER;
3401 if (gatherscatter == SG_NONE && !simd_lane_access)
3402 free_data_ref (newdr);
3405 if (gatherscatter == SG_NONE && !simd_lane_access)
3407 if (dump_enabled_p ())
3409 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3410 "not vectorized: data ref analysis "
3411 "failed ");
3412 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3415 if (is_a <bb_vec_info> (vinfo))
3416 break;
3418 return false;
3422 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3424 if (dump_enabled_p ())
3425 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3426 "not vectorized: base addr of dr is a "
3427 "constant\n");
3429 if (is_a <bb_vec_info> (vinfo))
3430 break;
3432 if (gatherscatter != SG_NONE || simd_lane_access)
3433 free_data_ref (dr);
3434 return false;
3437 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3439 if (dump_enabled_p ())
3441 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3442 "not vectorized: volatile type ");
3443 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3446 if (is_a <bb_vec_info> (vinfo))
3447 break;
3449 return false;
3452 if (stmt_can_throw_internal (stmt))
3454 if (dump_enabled_p ())
3456 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3457 "not vectorized: statement can throw an "
3458 "exception ");
3459 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3462 if (is_a <bb_vec_info> (vinfo))
3463 break;
3465 if (gatherscatter != SG_NONE || simd_lane_access)
3466 free_data_ref (dr);
3467 return false;
3470 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3471 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3473 if (dump_enabled_p ())
3475 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3476 "not vectorized: statement is bitfield "
3477 "access ");
3478 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3481 if (is_a <bb_vec_info> (vinfo))
3482 break;
3484 if (gatherscatter != SG_NONE || simd_lane_access)
3485 free_data_ref (dr);
3486 return false;
3489 base = unshare_expr (DR_BASE_ADDRESS (dr));
3490 offset = unshare_expr (DR_OFFSET (dr));
3491 init = unshare_expr (DR_INIT (dr));
3493 if (is_gimple_call (stmt)
3494 && (!gimple_call_internal_p (stmt)
3495 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3496 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3498 if (dump_enabled_p ())
3500 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3501 "not vectorized: dr in a call ");
3502 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3505 if (is_a <bb_vec_info> (vinfo))
3506 break;
3508 if (gatherscatter != SG_NONE || simd_lane_access)
3509 free_data_ref (dr);
3510 return false;
3513 /* Update DR field in stmt_vec_info struct. */
3515 /* If the dataref is in an inner-loop of the loop that is considered for
3516 for vectorization, we also want to analyze the access relative to
3517 the outer-loop (DR contains information only relative to the
3518 inner-most enclosing loop). We do that by building a reference to the
3519 first location accessed by the inner-loop, and analyze it relative to
3520 the outer-loop. */
3521 if (loop && nested_in_vect_loop_p (loop, stmt))
3523 /* Build a reference to the first location accessed by the
3524 inner loop: *(BASE + INIT + OFFSET). By construction,
3525 this address must be invariant in the inner loop, so we
3526 can consider it as being used in the outer loop. */
3527 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
3528 init, offset);
3529 tree init_addr = fold_build_pointer_plus (base, init_offset);
3530 tree init_ref = build_fold_indirect_ref (init_addr);
3532 if (dump_enabled_p ())
3534 dump_printf_loc (MSG_NOTE, vect_location,
3535 "analyze in outer loop: ");
3536 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
3537 dump_printf (MSG_NOTE, "\n");
3540 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
3541 init_ref, loop))
3542 /* dr_analyze_innermost already explained the failure. */
3543 return false;
3545 if (dump_enabled_p ())
3547 dump_printf_loc (MSG_NOTE, vect_location,
3548 "\touter base_address: ");
3549 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3550 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3551 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3552 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3553 STMT_VINFO_DR_OFFSET (stmt_info));
3554 dump_printf (MSG_NOTE,
3555 "\n\touter constant offset from base address: ");
3556 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3557 STMT_VINFO_DR_INIT (stmt_info));
3558 dump_printf (MSG_NOTE, "\n\touter step: ");
3559 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3560 STMT_VINFO_DR_STEP (stmt_info));
3561 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
3562 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
3563 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
3564 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
3565 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
3566 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
3567 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
3568 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
3572 if (STMT_VINFO_DATA_REF (stmt_info))
3574 if (dump_enabled_p ())
3576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3577 "not vectorized: more than one data ref "
3578 "in stmt: ");
3579 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3582 if (is_a <bb_vec_info> (vinfo))
3583 break;
3585 if (gatherscatter != SG_NONE || simd_lane_access)
3586 free_data_ref (dr);
3587 return false;
3590 STMT_VINFO_DATA_REF (stmt_info) = dr;
3591 if (simd_lane_access)
3593 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3594 free_data_ref (datarefs[i]);
3595 datarefs[i] = dr;
3598 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
3599 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
3600 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
3602 if (dump_enabled_p ())
3604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3605 "not vectorized: base object not addressable "
3606 "for stmt: ");
3607 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3609 if (is_a <bb_vec_info> (vinfo))
3611 /* In BB vectorization the ref can still participate
3612 in dependence analysis, we just can't vectorize it. */
3613 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3614 continue;
3616 return false;
3619 /* Set vectype for STMT. */
3620 scalar_type = TREE_TYPE (DR_REF (dr));
3621 STMT_VINFO_VECTYPE (stmt_info)
3622 = get_vectype_for_scalar_type (scalar_type);
3623 if (!STMT_VINFO_VECTYPE (stmt_info))
3625 if (dump_enabled_p ())
3627 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3628 "not vectorized: no vectype for stmt: ");
3629 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3630 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3631 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3632 scalar_type);
3633 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3636 if (is_a <bb_vec_info> (vinfo))
3638 /* No vector type is fine, the ref can still participate
3639 in dependence analysis, we just can't vectorize it. */
3640 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3641 continue;
3644 if (gatherscatter != SG_NONE || simd_lane_access)
3646 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3647 if (gatherscatter != SG_NONE)
3648 free_data_ref (dr);
3650 return false;
3652 else
3654 if (dump_enabled_p ())
3656 dump_printf_loc (MSG_NOTE, vect_location,
3657 "got vectype for stmt: ");
3658 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3659 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3660 STMT_VINFO_VECTYPE (stmt_info));
3661 dump_printf (MSG_NOTE, "\n");
3665 /* Adjust the minimal vectorization factor according to the
3666 vector type. */
3667 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3668 if (vf > *min_vf)
3669 *min_vf = vf;
3671 if (gatherscatter != SG_NONE)
3673 gather_scatter_info gs_info;
3674 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3675 &gs_info)
3676 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3678 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3679 free_data_ref (dr);
3680 if (dump_enabled_p ())
3682 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3683 (gatherscatter == GATHER) ?
3684 "not vectorized: not suitable for gather "
3685 "load " :
3686 "not vectorized: not suitable for scatter "
3687 "store ");
3688 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3690 return false;
3693 free_data_ref (datarefs[i]);
3694 datarefs[i] = dr;
3695 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3698 else if (is_a <loop_vec_info> (vinfo)
3699 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3701 if (nested_in_vect_loop_p (loop, stmt))
3703 if (dump_enabled_p ())
3705 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3706 "not vectorized: not suitable for strided "
3707 "load ");
3708 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3710 return false;
3712 STMT_VINFO_STRIDED_P (stmt_info) = true;
3716 /* If we stopped analysis at the first dataref we could not analyze
3717 when trying to vectorize a basic-block mark the rest of the datarefs
3718 as not vectorizable and truncate the vector of datarefs. That
3719 avoids spending useless time in analyzing their dependence. */
3720 if (i != datarefs.length ())
3722 gcc_assert (is_a <bb_vec_info> (vinfo));
3723 for (unsigned j = i; j < datarefs.length (); ++j)
3725 data_reference_p dr = datarefs[j];
3726 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3727 free_data_ref (dr);
3729 datarefs.truncate (i);
3732 return true;
3736 /* Function vect_get_new_vect_var.
3738 Returns a name for a new variable. The current naming scheme appends the
3739 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3740 the name of vectorizer generated variables, and appends that to NAME if
3741 provided. */
3743 tree
3744 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3746 const char *prefix;
3747 tree new_vect_var;
3749 switch (var_kind)
3751 case vect_simple_var:
3752 prefix = "vect";
3753 break;
3754 case vect_scalar_var:
3755 prefix = "stmp";
3756 break;
3757 case vect_mask_var:
3758 prefix = "mask";
3759 break;
3760 case vect_pointer_var:
3761 prefix = "vectp";
3762 break;
3763 default:
3764 gcc_unreachable ();
3767 if (name)
3769 char* tmp = concat (prefix, "_", name, NULL);
3770 new_vect_var = create_tmp_reg (type, tmp);
3771 free (tmp);
3773 else
3774 new_vect_var = create_tmp_reg (type, prefix);
3776 return new_vect_var;
3779 /* Like vect_get_new_vect_var but return an SSA name. */
3781 tree
3782 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3784 const char *prefix;
3785 tree new_vect_var;
3787 switch (var_kind)
3789 case vect_simple_var:
3790 prefix = "vect";
3791 break;
3792 case vect_scalar_var:
3793 prefix = "stmp";
3794 break;
3795 case vect_pointer_var:
3796 prefix = "vectp";
3797 break;
3798 default:
3799 gcc_unreachable ();
3802 if (name)
3804 char* tmp = concat (prefix, "_", name, NULL);
3805 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
3806 free (tmp);
3808 else
3809 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
3811 return new_vect_var;
3814 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. */
3816 static void
3817 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3818 stmt_vec_info stmt_info)
3820 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3821 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3822 int misalign = DR_MISALIGNMENT (dr);
3823 if (misalign == DR_MISALIGNMENT_UNKNOWN)
3824 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3825 else
3826 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
3829 /* Function vect_create_addr_base_for_vector_ref.
3831 Create an expression that computes the address of the first memory location
3832 that will be accessed for a data reference.
3834 Input:
3835 STMT: The statement containing the data reference.
3836 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3837 OFFSET: Optional. If supplied, it is be added to the initial address.
3838 LOOP: Specify relative to which loop-nest should the address be computed.
3839 For example, when the dataref is in an inner-loop nested in an
3840 outer-loop that is now being vectorized, LOOP can be either the
3841 outer-loop, or the inner-loop. The first memory location accessed
3842 by the following dataref ('in' points to short):
3844 for (i=0; i<N; i++)
3845 for (j=0; j<M; j++)
3846 s += in[i+j]
3848 is as follows:
3849 if LOOP=i_loop: &in (relative to i_loop)
3850 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3851 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
3852 initial address. Unlike OFFSET, which is number of elements to
3853 be added, BYTE_OFFSET is measured in bytes.
3855 Output:
3856 1. Return an SSA_NAME whose value is the address of the memory location of
3857 the first vector of the data reference.
3858 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3859 these statement(s) which define the returned SSA_NAME.
3861 FORNOW: We are only handling array accesses with step 1. */
3863 tree
3864 vect_create_addr_base_for_vector_ref (gimple *stmt,
3865 gimple_seq *new_stmt_list,
3866 tree offset,
3867 tree byte_offset)
3869 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3870 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3871 const char *base_name;
3872 tree addr_base;
3873 tree dest;
3874 gimple_seq seq = NULL;
3875 tree vect_ptr_type;
3876 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3877 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3878 innermost_loop_behavior *drb = vect_dr_behavior (dr);
3880 tree data_ref_base = unshare_expr (drb->base_address);
3881 tree base_offset = unshare_expr (drb->offset);
3882 tree init = unshare_expr (drb->init);
3884 if (loop_vinfo)
3885 base_name = get_name (data_ref_base);
3886 else
3888 base_offset = ssize_int (0);
3889 init = ssize_int (0);
3890 base_name = get_name (DR_REF (dr));
3893 /* Create base_offset */
3894 base_offset = size_binop (PLUS_EXPR,
3895 fold_convert (sizetype, base_offset),
3896 fold_convert (sizetype, init));
3898 if (offset)
3900 offset = fold_build2 (MULT_EXPR, sizetype,
3901 fold_convert (sizetype, offset), step);
3902 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3903 base_offset, offset);
3905 if (byte_offset)
3907 byte_offset = fold_convert (sizetype, byte_offset);
3908 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3909 base_offset, byte_offset);
3912 /* base + base_offset */
3913 if (loop_vinfo)
3914 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3915 else
3917 addr_base = build1 (ADDR_EXPR,
3918 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3919 unshare_expr (DR_REF (dr)));
3922 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3923 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3924 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
3925 gimple_seq_add_seq (new_stmt_list, seq);
3927 if (DR_PTR_INFO (dr)
3928 && TREE_CODE (addr_base) == SSA_NAME
3929 && !SSA_NAME_PTR_INFO (addr_base))
3931 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
3932 if (offset || byte_offset)
3933 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3936 if (dump_enabled_p ())
3938 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3939 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3940 dump_printf (MSG_NOTE, "\n");
3943 return addr_base;
3947 /* Function vect_create_data_ref_ptr.
3949 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3950 location accessed in the loop by STMT, along with the def-use update
3951 chain to appropriately advance the pointer through the loop iterations.
3952 Also set aliasing information for the pointer. This pointer is used by
3953 the callers to this function to create a memory reference expression for
3954 vector load/store access.
3956 Input:
3957 1. STMT: a stmt that references memory. Expected to be of the form
3958 GIMPLE_ASSIGN <name, data-ref> or
3959 GIMPLE_ASSIGN <data-ref, name>.
3960 2. AGGR_TYPE: the type of the reference, which should be either a vector
3961 or an array.
3962 3. AT_LOOP: the loop where the vector memref is to be created.
3963 4. OFFSET (optional): an offset to be added to the initial address accessed
3964 by the data-ref in STMT.
3965 5. BSI: location where the new stmts are to be placed if there is no loop
3966 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
3967 pointing to the initial address.
3968 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
3969 to the initial address accessed by the data-ref in STMT. This is
3970 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
3971 in bytes.
3973 Output:
3974 1. Declare a new ptr to vector_type, and have it point to the base of the
3975 data reference (initial addressed accessed by the data reference).
3976 For example, for vector of type V8HI, the following code is generated:
3978 v8hi *ap;
3979 ap = (v8hi *)initial_address;
3981 if OFFSET is not supplied:
3982 initial_address = &a[init];
3983 if OFFSET is supplied:
3984 initial_address = &a[init + OFFSET];
3985 if BYTE_OFFSET is supplied:
3986 initial_address = &a[init] + BYTE_OFFSET;
3988 Return the initial_address in INITIAL_ADDRESS.
3990 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
3991 update the pointer in each iteration of the loop.
3993 Return the increment stmt that updates the pointer in PTR_INCR.
3995 3. Set INV_P to true if the access pattern of the data reference in the
3996 vectorized loop is invariant. Set it to false otherwise.
3998 4. Return the pointer. */
4000 tree
4001 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4002 tree offset, tree *initial_address,
4003 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4004 bool only_init, bool *inv_p, tree byte_offset)
4006 const char *base_name;
4007 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4008 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4009 struct loop *loop = NULL;
4010 bool nested_in_vect_loop = false;
4011 struct loop *containing_loop = NULL;
4012 tree aggr_ptr_type;
4013 tree aggr_ptr;
4014 tree new_temp;
4015 gimple_seq new_stmt_list = NULL;
4016 edge pe = NULL;
4017 basic_block new_bb;
4018 tree aggr_ptr_init;
4019 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4020 tree aptr;
4021 gimple_stmt_iterator incr_gsi;
4022 bool insert_after;
4023 tree indx_before_incr, indx_after_incr;
4024 gimple *incr;
4025 tree step;
4026 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4028 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4029 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4031 if (loop_vinfo)
4033 loop = LOOP_VINFO_LOOP (loop_vinfo);
4034 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4035 containing_loop = (gimple_bb (stmt))->loop_father;
4036 pe = loop_preheader_edge (loop);
4038 else
4040 gcc_assert (bb_vinfo);
4041 only_init = true;
4042 *ptr_incr = NULL;
4045 /* Check the step (evolution) of the load in LOOP, and record
4046 whether it's invariant. */
4047 step = vect_dr_behavior (dr)->step;
4048 if (integer_zerop (step))
4049 *inv_p = true;
4050 else
4051 *inv_p = false;
4053 /* Create an expression for the first address accessed by this load
4054 in LOOP. */
4055 base_name = get_name (DR_BASE_ADDRESS (dr));
4057 if (dump_enabled_p ())
4059 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4060 dump_printf_loc (MSG_NOTE, vect_location,
4061 "create %s-pointer variable to type: ",
4062 get_tree_code_name (TREE_CODE (aggr_type)));
4063 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4064 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4065 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4066 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4067 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4068 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4069 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4070 else
4071 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4072 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4073 dump_printf (MSG_NOTE, "\n");
4076 /* (1) Create the new aggregate-pointer variable.
4077 Vector and array types inherit the alias set of their component
4078 type by default so we need to use a ref-all pointer if the data
4079 reference does not conflict with the created aggregated data
4080 reference because it is not addressable. */
4081 bool need_ref_all = false;
4082 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4083 get_alias_set (DR_REF (dr))))
4084 need_ref_all = true;
4085 /* Likewise for any of the data references in the stmt group. */
4086 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4088 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4091 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4092 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4093 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4094 get_alias_set (DR_REF (sdr))))
4096 need_ref_all = true;
4097 break;
4099 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4101 while (orig_stmt);
4103 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4104 need_ref_all);
4105 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4108 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4109 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4110 def-use update cycles for the pointer: one relative to the outer-loop
4111 (LOOP), which is what steps (3) and (4) below do. The other is relative
4112 to the inner-loop (which is the inner-most loop containing the dataref),
4113 and this is done be step (5) below.
4115 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4116 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4117 redundant. Steps (3),(4) create the following:
4119 vp0 = &base_addr;
4120 LOOP: vp1 = phi(vp0,vp2)
4123 vp2 = vp1 + step
4124 goto LOOP
4126 If there is an inner-loop nested in loop, then step (5) will also be
4127 applied, and an additional update in the inner-loop will be created:
4129 vp0 = &base_addr;
4130 LOOP: vp1 = phi(vp0,vp2)
4132 inner: vp3 = phi(vp1,vp4)
4133 vp4 = vp3 + inner_step
4134 if () goto inner
4136 vp2 = vp1 + step
4137 if () goto LOOP */
4139 /* (2) Calculate the initial address of the aggregate-pointer, and set
4140 the aggregate-pointer to point to it before the loop. */
4142 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4144 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4145 offset, byte_offset);
4146 if (new_stmt_list)
4148 if (pe)
4150 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4151 gcc_assert (!new_bb);
4153 else
4154 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4157 *initial_address = new_temp;
4158 aggr_ptr_init = new_temp;
4160 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4161 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4162 inner-loop nested in LOOP (during outer-loop vectorization). */
4164 /* No update in loop is required. */
4165 if (only_init && (!loop_vinfo || at_loop == loop))
4166 aptr = aggr_ptr_init;
4167 else
4169 /* The step of the aggregate pointer is the type size. */
4170 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4171 /* One exception to the above is when the scalar step of the load in
4172 LOOP is zero. In this case the step here is also zero. */
4173 if (*inv_p)
4174 iv_step = size_zero_node;
4175 else if (tree_int_cst_sgn (step) == -1)
4176 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4178 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4180 create_iv (aggr_ptr_init,
4181 fold_convert (aggr_ptr_type, iv_step),
4182 aggr_ptr, loop, &incr_gsi, insert_after,
4183 &indx_before_incr, &indx_after_incr);
4184 incr = gsi_stmt (incr_gsi);
4185 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4187 /* Copy the points-to information if it exists. */
4188 if (DR_PTR_INFO (dr))
4190 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4191 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4193 if (ptr_incr)
4194 *ptr_incr = incr;
4196 aptr = indx_before_incr;
4199 if (!nested_in_vect_loop || only_init)
4200 return aptr;
4203 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4204 nested in LOOP, if exists. */
4206 gcc_assert (nested_in_vect_loop);
4207 if (!only_init)
4209 standard_iv_increment_position (containing_loop, &incr_gsi,
4210 &insert_after);
4211 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4212 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4213 &indx_after_incr);
4214 incr = gsi_stmt (incr_gsi);
4215 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4217 /* Copy the points-to information if it exists. */
4218 if (DR_PTR_INFO (dr))
4220 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4221 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4223 if (ptr_incr)
4224 *ptr_incr = incr;
4226 return indx_before_incr;
4228 else
4229 gcc_unreachable ();
4233 /* Function bump_vector_ptr
4235 Increment a pointer (to a vector type) by vector-size. If requested,
4236 i.e. if PTR-INCR is given, then also connect the new increment stmt
4237 to the existing def-use update-chain of the pointer, by modifying
4238 the PTR_INCR as illustrated below:
4240 The pointer def-use update-chain before this function:
4241 DATAREF_PTR = phi (p_0, p_2)
4242 ....
4243 PTR_INCR: p_2 = DATAREF_PTR + step
4245 The pointer def-use update-chain after this function:
4246 DATAREF_PTR = phi (p_0, p_2)
4247 ....
4248 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4249 ....
4250 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4252 Input:
4253 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4254 in the loop.
4255 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4256 the loop. The increment amount across iterations is expected
4257 to be vector_size.
4258 BSI - location where the new update stmt is to be placed.
4259 STMT - the original scalar memory-access stmt that is being vectorized.
4260 BUMP - optional. The offset by which to bump the pointer. If not given,
4261 the offset is assumed to be vector_size.
4263 Output: Return NEW_DATAREF_PTR as illustrated above.
4267 tree
4268 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4269 gimple *stmt, tree bump)
4271 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4272 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4273 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4274 tree update = TYPE_SIZE_UNIT (vectype);
4275 gassign *incr_stmt;
4276 ssa_op_iter iter;
4277 use_operand_p use_p;
4278 tree new_dataref_ptr;
4280 if (bump)
4281 update = bump;
4283 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4284 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4285 else
4286 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4287 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4288 dataref_ptr, update);
4289 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4291 /* Copy the points-to information if it exists. */
4292 if (DR_PTR_INFO (dr))
4294 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4295 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4298 if (!ptr_incr)
4299 return new_dataref_ptr;
4301 /* Update the vector-pointer's cross-iteration increment. */
4302 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4304 tree use = USE_FROM_PTR (use_p);
4306 if (use == dataref_ptr)
4307 SET_USE (use_p, new_dataref_ptr);
4308 else
4309 gcc_assert (tree_int_cst_compare (use, update) == 0);
4312 return new_dataref_ptr;
4316 /* Function vect_create_destination_var.
4318 Create a new temporary of type VECTYPE. */
4320 tree
4321 vect_create_destination_var (tree scalar_dest, tree vectype)
4323 tree vec_dest;
4324 const char *name;
4325 char *new_name;
4326 tree type;
4327 enum vect_var_kind kind;
4329 kind = vectype
4330 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4331 ? vect_mask_var
4332 : vect_simple_var
4333 : vect_scalar_var;
4334 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4336 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4338 name = get_name (scalar_dest);
4339 if (name)
4340 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4341 else
4342 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4343 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4344 free (new_name);
4346 return vec_dest;
4349 /* Function vect_grouped_store_supported.
4351 Returns TRUE if interleave high and interleave low permutations
4352 are supported, and FALSE otherwise. */
4354 bool
4355 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4357 machine_mode mode = TYPE_MODE (vectype);
4359 /* vect_permute_store_chain requires the group size to be equal to 3 or
4360 be a power of two. */
4361 if (count != 3 && exact_log2 (count) == -1)
4363 if (dump_enabled_p ())
4364 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4365 "the size of the group of accesses"
4366 " is not a power of 2 or not eqaul to 3\n");
4367 return false;
4370 /* Check that the permutation is supported. */
4371 if (VECTOR_MODE_P (mode))
4373 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4374 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4376 if (count == 3)
4378 unsigned int j0 = 0, j1 = 0, j2 = 0;
4379 unsigned int i, j;
4381 for (j = 0; j < 3; j++)
4383 int nelt0 = ((3 - j) * nelt) % 3;
4384 int nelt1 = ((3 - j) * nelt + 1) % 3;
4385 int nelt2 = ((3 - j) * nelt + 2) % 3;
4386 for (i = 0; i < nelt; i++)
4388 if (3 * i + nelt0 < nelt)
4389 sel[3 * i + nelt0] = j0++;
4390 if (3 * i + nelt1 < nelt)
4391 sel[3 * i + nelt1] = nelt + j1++;
4392 if (3 * i + nelt2 < nelt)
4393 sel[3 * i + nelt2] = 0;
4395 if (!can_vec_perm_p (mode, false, sel))
4397 if (dump_enabled_p ())
4398 dump_printf (MSG_MISSED_OPTIMIZATION,
4399 "permutaion op not supported by target.\n");
4400 return false;
4403 for (i = 0; i < nelt; i++)
4405 if (3 * i + nelt0 < nelt)
4406 sel[3 * i + nelt0] = 3 * i + nelt0;
4407 if (3 * i + nelt1 < nelt)
4408 sel[3 * i + nelt1] = 3 * i + nelt1;
4409 if (3 * i + nelt2 < nelt)
4410 sel[3 * i + nelt2] = nelt + j2++;
4412 if (!can_vec_perm_p (mode, false, sel))
4414 if (dump_enabled_p ())
4415 dump_printf (MSG_MISSED_OPTIMIZATION,
4416 "permutaion op not supported by target.\n");
4417 return false;
4420 return true;
4422 else
4424 /* If length is not equal to 3 then only power of 2 is supported. */
4425 gcc_assert (pow2p_hwi (count));
4427 for (i = 0; i < nelt / 2; i++)
4429 sel[i * 2] = i;
4430 sel[i * 2 + 1] = i + nelt;
4432 if (can_vec_perm_p (mode, false, sel))
4434 for (i = 0; i < nelt; i++)
4435 sel[i] += nelt / 2;
4436 if (can_vec_perm_p (mode, false, sel))
4437 return true;
4442 if (dump_enabled_p ())
4443 dump_printf (MSG_MISSED_OPTIMIZATION,
4444 "permutaion op not supported by target.\n");
4445 return false;
4449 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4450 type VECTYPE. */
4452 bool
4453 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4455 return vect_lanes_optab_supported_p ("vec_store_lanes",
4456 vec_store_lanes_optab,
4457 vectype, count);
4461 /* Function vect_permute_store_chain.
4463 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4464 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4465 the data correctly for the stores. Return the final references for stores
4466 in RESULT_CHAIN.
4468 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4469 The input is 4 vectors each containing 8 elements. We assign a number to
4470 each element, the input sequence is:
4472 1st vec: 0 1 2 3 4 5 6 7
4473 2nd vec: 8 9 10 11 12 13 14 15
4474 3rd vec: 16 17 18 19 20 21 22 23
4475 4th vec: 24 25 26 27 28 29 30 31
4477 The output sequence should be:
4479 1st vec: 0 8 16 24 1 9 17 25
4480 2nd vec: 2 10 18 26 3 11 19 27
4481 3rd vec: 4 12 20 28 5 13 21 30
4482 4th vec: 6 14 22 30 7 15 23 31
4484 i.e., we interleave the contents of the four vectors in their order.
4486 We use interleave_high/low instructions to create such output. The input of
4487 each interleave_high/low operation is two vectors:
4488 1st vec 2nd vec
4489 0 1 2 3 4 5 6 7
4490 the even elements of the result vector are obtained left-to-right from the
4491 high/low elements of the first vector. The odd elements of the result are
4492 obtained left-to-right from the high/low elements of the second vector.
4493 The output of interleave_high will be: 0 4 1 5
4494 and of interleave_low: 2 6 3 7
4497 The permutation is done in log LENGTH stages. In each stage interleave_high
4498 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4499 where the first argument is taken from the first half of DR_CHAIN and the
4500 second argument from it's second half.
4501 In our example,
4503 I1: interleave_high (1st vec, 3rd vec)
4504 I2: interleave_low (1st vec, 3rd vec)
4505 I3: interleave_high (2nd vec, 4th vec)
4506 I4: interleave_low (2nd vec, 4th vec)
4508 The output for the first stage is:
4510 I1: 0 16 1 17 2 18 3 19
4511 I2: 4 20 5 21 6 22 7 23
4512 I3: 8 24 9 25 10 26 11 27
4513 I4: 12 28 13 29 14 30 15 31
4515 The output of the second stage, i.e. the final result is:
4517 I1: 0 8 16 24 1 9 17 25
4518 I2: 2 10 18 26 3 11 19 27
4519 I3: 4 12 20 28 5 13 21 30
4520 I4: 6 14 22 30 7 15 23 31. */
4522 void
4523 vect_permute_store_chain (vec<tree> dr_chain,
4524 unsigned int length,
4525 gimple *stmt,
4526 gimple_stmt_iterator *gsi,
4527 vec<tree> *result_chain)
4529 tree vect1, vect2, high, low;
4530 gimple *perm_stmt;
4531 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4532 tree perm_mask_low, perm_mask_high;
4533 tree data_ref;
4534 tree perm3_mask_low, perm3_mask_high;
4535 unsigned int i, n, log_length = exact_log2 (length);
4536 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4537 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4539 result_chain->quick_grow (length);
4540 memcpy (result_chain->address (), dr_chain.address (),
4541 length * sizeof (tree));
4543 if (length == 3)
4545 unsigned int j0 = 0, j1 = 0, j2 = 0;
4547 for (j = 0; j < 3; j++)
4549 int nelt0 = ((3 - j) * nelt) % 3;
4550 int nelt1 = ((3 - j) * nelt + 1) % 3;
4551 int nelt2 = ((3 - j) * nelt + 2) % 3;
4553 for (i = 0; i < nelt; i++)
4555 if (3 * i + nelt0 < nelt)
4556 sel[3 * i + nelt0] = j0++;
4557 if (3 * i + nelt1 < nelt)
4558 sel[3 * i + nelt1] = nelt + j1++;
4559 if (3 * i + nelt2 < nelt)
4560 sel[3 * i + nelt2] = 0;
4562 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4564 for (i = 0; i < nelt; i++)
4566 if (3 * i + nelt0 < nelt)
4567 sel[3 * i + nelt0] = 3 * i + nelt0;
4568 if (3 * i + nelt1 < nelt)
4569 sel[3 * i + nelt1] = 3 * i + nelt1;
4570 if (3 * i + nelt2 < nelt)
4571 sel[3 * i + nelt2] = nelt + j2++;
4573 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4575 vect1 = dr_chain[0];
4576 vect2 = dr_chain[1];
4578 /* Create interleaving stmt:
4579 low = VEC_PERM_EXPR <vect1, vect2,
4580 {j, nelt, *, j + 1, nelt + j + 1, *,
4581 j + 2, nelt + j + 2, *, ...}> */
4582 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4583 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4584 vect2, perm3_mask_low);
4585 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4587 vect1 = data_ref;
4588 vect2 = dr_chain[2];
4589 /* Create interleaving stmt:
4590 low = VEC_PERM_EXPR <vect1, vect2,
4591 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4592 6, 7, nelt + j + 2, ...}> */
4593 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4594 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4595 vect2, perm3_mask_high);
4596 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4597 (*result_chain)[j] = data_ref;
4600 else
4602 /* If length is not equal to 3 then only power of 2 is supported. */
4603 gcc_assert (pow2p_hwi (length));
4605 for (i = 0, n = nelt / 2; i < n; i++)
4607 sel[i * 2] = i;
4608 sel[i * 2 + 1] = i + nelt;
4610 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4612 for (i = 0; i < nelt; i++)
4613 sel[i] += nelt / 2;
4614 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4616 for (i = 0, n = log_length; i < n; i++)
4618 for (j = 0; j < length/2; j++)
4620 vect1 = dr_chain[j];
4621 vect2 = dr_chain[j+length/2];
4623 /* Create interleaving stmt:
4624 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4625 ...}> */
4626 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4627 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4628 vect2, perm_mask_high);
4629 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4630 (*result_chain)[2*j] = high;
4632 /* Create interleaving stmt:
4633 low = VEC_PERM_EXPR <vect1, vect2,
4634 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4635 ...}> */
4636 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4637 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4638 vect2, perm_mask_low);
4639 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4640 (*result_chain)[2*j+1] = low;
4642 memcpy (dr_chain.address (), result_chain->address (),
4643 length * sizeof (tree));
4648 /* Function vect_setup_realignment
4650 This function is called when vectorizing an unaligned load using
4651 the dr_explicit_realign[_optimized] scheme.
4652 This function generates the following code at the loop prolog:
4654 p = initial_addr;
4655 x msq_init = *(floor(p)); # prolog load
4656 realignment_token = call target_builtin;
4657 loop:
4658 x msq = phi (msq_init, ---)
4660 The stmts marked with x are generated only for the case of
4661 dr_explicit_realign_optimized.
4663 The code above sets up a new (vector) pointer, pointing to the first
4664 location accessed by STMT, and a "floor-aligned" load using that pointer.
4665 It also generates code to compute the "realignment-token" (if the relevant
4666 target hook was defined), and creates a phi-node at the loop-header bb
4667 whose arguments are the result of the prolog-load (created by this
4668 function) and the result of a load that takes place in the loop (to be
4669 created by the caller to this function).
4671 For the case of dr_explicit_realign_optimized:
4672 The caller to this function uses the phi-result (msq) to create the
4673 realignment code inside the loop, and sets up the missing phi argument,
4674 as follows:
4675 loop:
4676 msq = phi (msq_init, lsq)
4677 lsq = *(floor(p')); # load in loop
4678 result = realign_load (msq, lsq, realignment_token);
4680 For the case of dr_explicit_realign:
4681 loop:
4682 msq = *(floor(p)); # load in loop
4683 p' = p + (VS-1);
4684 lsq = *(floor(p')); # load in loop
4685 result = realign_load (msq, lsq, realignment_token);
4687 Input:
4688 STMT - (scalar) load stmt to be vectorized. This load accesses
4689 a memory location that may be unaligned.
4690 BSI - place where new code is to be inserted.
4691 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4692 is used.
4694 Output:
4695 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4696 target hook, if defined.
4697 Return value - the result of the loop-header phi node. */
4699 tree
4700 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4701 tree *realignment_token,
4702 enum dr_alignment_support alignment_support_scheme,
4703 tree init_addr,
4704 struct loop **at_loop)
4706 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4707 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4708 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4709 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4710 struct loop *loop = NULL;
4711 edge pe = NULL;
4712 tree scalar_dest = gimple_assign_lhs (stmt);
4713 tree vec_dest;
4714 gimple *inc;
4715 tree ptr;
4716 tree data_ref;
4717 basic_block new_bb;
4718 tree msq_init = NULL_TREE;
4719 tree new_temp;
4720 gphi *phi_stmt;
4721 tree msq = NULL_TREE;
4722 gimple_seq stmts = NULL;
4723 bool inv_p;
4724 bool compute_in_loop = false;
4725 bool nested_in_vect_loop = false;
4726 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4727 struct loop *loop_for_initial_load = NULL;
4729 if (loop_vinfo)
4731 loop = LOOP_VINFO_LOOP (loop_vinfo);
4732 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4735 gcc_assert (alignment_support_scheme == dr_explicit_realign
4736 || alignment_support_scheme == dr_explicit_realign_optimized);
4738 /* We need to generate three things:
4739 1. the misalignment computation
4740 2. the extra vector load (for the optimized realignment scheme).
4741 3. the phi node for the two vectors from which the realignment is
4742 done (for the optimized realignment scheme). */
4744 /* 1. Determine where to generate the misalignment computation.
4746 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4747 calculation will be generated by this function, outside the loop (in the
4748 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4749 caller, inside the loop.
4751 Background: If the misalignment remains fixed throughout the iterations of
4752 the loop, then both realignment schemes are applicable, and also the
4753 misalignment computation can be done outside LOOP. This is because we are
4754 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4755 are a multiple of VS (the Vector Size), and therefore the misalignment in
4756 different vectorized LOOP iterations is always the same.
4757 The problem arises only if the memory access is in an inner-loop nested
4758 inside LOOP, which is now being vectorized using outer-loop vectorization.
4759 This is the only case when the misalignment of the memory access may not
4760 remain fixed throughout the iterations of the inner-loop (as explained in
4761 detail in vect_supportable_dr_alignment). In this case, not only is the
4762 optimized realignment scheme not applicable, but also the misalignment
4763 computation (and generation of the realignment token that is passed to
4764 REALIGN_LOAD) have to be done inside the loop.
4766 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4767 or not, which in turn determines if the misalignment is computed inside
4768 the inner-loop, or outside LOOP. */
4770 if (init_addr != NULL_TREE || !loop_vinfo)
4772 compute_in_loop = true;
4773 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4777 /* 2. Determine where to generate the extra vector load.
4779 For the optimized realignment scheme, instead of generating two vector
4780 loads in each iteration, we generate a single extra vector load in the
4781 preheader of the loop, and in each iteration reuse the result of the
4782 vector load from the previous iteration. In case the memory access is in
4783 an inner-loop nested inside LOOP, which is now being vectorized using
4784 outer-loop vectorization, we need to determine whether this initial vector
4785 load should be generated at the preheader of the inner-loop, or can be
4786 generated at the preheader of LOOP. If the memory access has no evolution
4787 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4788 to be generated inside LOOP (in the preheader of the inner-loop). */
4790 if (nested_in_vect_loop)
4792 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4793 bool invariant_in_outerloop =
4794 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4795 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4797 else
4798 loop_for_initial_load = loop;
4799 if (at_loop)
4800 *at_loop = loop_for_initial_load;
4802 if (loop_for_initial_load)
4803 pe = loop_preheader_edge (loop_for_initial_load);
4805 /* 3. For the case of the optimized realignment, create the first vector
4806 load at the loop preheader. */
4808 if (alignment_support_scheme == dr_explicit_realign_optimized)
4810 /* Create msq_init = *(floor(p1)) in the loop preheader */
4811 gassign *new_stmt;
4813 gcc_assert (!compute_in_loop);
4814 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4815 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4816 NULL_TREE, &init_addr, NULL, &inc,
4817 true, &inv_p);
4818 if (TREE_CODE (ptr) == SSA_NAME)
4819 new_temp = copy_ssa_name (ptr);
4820 else
4821 new_temp = make_ssa_name (TREE_TYPE (ptr));
4822 new_stmt = gimple_build_assign
4823 (new_temp, BIT_AND_EXPR, ptr,
4824 build_int_cst (TREE_TYPE (ptr),
4825 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4826 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4827 gcc_assert (!new_bb);
4828 data_ref
4829 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4830 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4831 new_stmt = gimple_build_assign (vec_dest, data_ref);
4832 new_temp = make_ssa_name (vec_dest, new_stmt);
4833 gimple_assign_set_lhs (new_stmt, new_temp);
4834 if (pe)
4836 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4837 gcc_assert (!new_bb);
4839 else
4840 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4842 msq_init = gimple_assign_lhs (new_stmt);
4845 /* 4. Create realignment token using a target builtin, if available.
4846 It is done either inside the containing loop, or before LOOP (as
4847 determined above). */
4849 if (targetm.vectorize.builtin_mask_for_load)
4851 gcall *new_stmt;
4852 tree builtin_decl;
4854 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
4855 if (!init_addr)
4857 /* Generate the INIT_ADDR computation outside LOOP. */
4858 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4859 NULL_TREE);
4860 if (loop)
4862 pe = loop_preheader_edge (loop);
4863 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4864 gcc_assert (!new_bb);
4866 else
4867 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4870 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4871 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4872 vec_dest =
4873 vect_create_destination_var (scalar_dest,
4874 gimple_call_return_type (new_stmt));
4875 new_temp = make_ssa_name (vec_dest, new_stmt);
4876 gimple_call_set_lhs (new_stmt, new_temp);
4878 if (compute_in_loop)
4879 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4880 else
4882 /* Generate the misalignment computation outside LOOP. */
4883 pe = loop_preheader_edge (loop);
4884 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4885 gcc_assert (!new_bb);
4888 *realignment_token = gimple_call_lhs (new_stmt);
4890 /* The result of the CALL_EXPR to this builtin is determined from
4891 the value of the parameter and no global variables are touched
4892 which makes the builtin a "const" function. Requiring the
4893 builtin to have the "const" attribute makes it unnecessary
4894 to call mark_call_clobbered. */
4895 gcc_assert (TREE_READONLY (builtin_decl));
4898 if (alignment_support_scheme == dr_explicit_realign)
4899 return msq;
4901 gcc_assert (!compute_in_loop);
4902 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4905 /* 5. Create msq = phi <msq_init, lsq> in loop */
4907 pe = loop_preheader_edge (containing_loop);
4908 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4909 msq = make_ssa_name (vec_dest);
4910 phi_stmt = create_phi_node (msq, containing_loop->header);
4911 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4913 return msq;
4917 /* Function vect_grouped_load_supported.
4919 COUNT is the size of the load group (the number of statements plus the
4920 number of gaps). SINGLE_ELEMENT_P is true if there is actually
4921 only one statement, with a gap of COUNT - 1.
4923 Returns true if a suitable permute exists. */
4925 bool
4926 vect_grouped_load_supported (tree vectype, bool single_element_p,
4927 unsigned HOST_WIDE_INT count)
4929 machine_mode mode = TYPE_MODE (vectype);
4931 /* If this is single-element interleaving with an element distance
4932 that leaves unused vector loads around punt - we at least create
4933 very sub-optimal code in that case (and blow up memory,
4934 see PR65518). */
4935 if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
4937 if (dump_enabled_p ())
4938 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4939 "single-element interleaving not supported "
4940 "for not adjacent vector loads\n");
4941 return false;
4944 /* vect_permute_load_chain requires the group size to be equal to 3 or
4945 be a power of two. */
4946 if (count != 3 && exact_log2 (count) == -1)
4948 if (dump_enabled_p ())
4949 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4950 "the size of the group of accesses"
4951 " is not a power of 2 or not equal to 3\n");
4952 return false;
4955 /* Check that the permutation is supported. */
4956 if (VECTOR_MODE_P (mode))
4958 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
4959 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4961 if (count == 3)
4963 unsigned int k;
4964 for (k = 0; k < 3; k++)
4966 for (i = 0; i < nelt; i++)
4967 if (3 * i + k < 2 * nelt)
4968 sel[i] = 3 * i + k;
4969 else
4970 sel[i] = 0;
4971 if (!can_vec_perm_p (mode, false, sel))
4973 if (dump_enabled_p ())
4974 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4975 "shuffle of 3 loads is not supported by"
4976 " target\n");
4977 return false;
4979 for (i = 0, j = 0; i < nelt; i++)
4980 if (3 * i + k < 2 * nelt)
4981 sel[i] = i;
4982 else
4983 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
4984 if (!can_vec_perm_p (mode, false, sel))
4986 if (dump_enabled_p ())
4987 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4988 "shuffle of 3 loads is not supported by"
4989 " target\n");
4990 return false;
4993 return true;
4995 else
4997 /* If length is not equal to 3 then only power of 2 is supported. */
4998 gcc_assert (pow2p_hwi (count));
4999 for (i = 0; i < nelt; i++)
5000 sel[i] = i * 2;
5001 if (can_vec_perm_p (mode, false, sel))
5003 for (i = 0; i < nelt; i++)
5004 sel[i] = i * 2 + 1;
5005 if (can_vec_perm_p (mode, false, sel))
5006 return true;
5011 if (dump_enabled_p ())
5012 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5013 "extract even/odd not supported by target\n");
5014 return false;
5017 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5018 type VECTYPE. */
5020 bool
5021 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5023 return vect_lanes_optab_supported_p ("vec_load_lanes",
5024 vec_load_lanes_optab,
5025 vectype, count);
5028 /* Function vect_permute_load_chain.
5030 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5031 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5032 the input data correctly. Return the final references for loads in
5033 RESULT_CHAIN.
5035 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5036 The input is 4 vectors each containing 8 elements. We assign a number to each
5037 element, the input sequence is:
5039 1st vec: 0 1 2 3 4 5 6 7
5040 2nd vec: 8 9 10 11 12 13 14 15
5041 3rd vec: 16 17 18 19 20 21 22 23
5042 4th vec: 24 25 26 27 28 29 30 31
5044 The output sequence should be:
5046 1st vec: 0 4 8 12 16 20 24 28
5047 2nd vec: 1 5 9 13 17 21 25 29
5048 3rd vec: 2 6 10 14 18 22 26 30
5049 4th vec: 3 7 11 15 19 23 27 31
5051 i.e., the first output vector should contain the first elements of each
5052 interleaving group, etc.
5054 We use extract_even/odd instructions to create such output. The input of
5055 each extract_even/odd operation is two vectors
5056 1st vec 2nd vec
5057 0 1 2 3 4 5 6 7
5059 and the output is the vector of extracted even/odd elements. The output of
5060 extract_even will be: 0 2 4 6
5061 and of extract_odd: 1 3 5 7
5064 The permutation is done in log LENGTH stages. In each stage extract_even
5065 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5066 their order. In our example,
5068 E1: extract_even (1st vec, 2nd vec)
5069 E2: extract_odd (1st vec, 2nd vec)
5070 E3: extract_even (3rd vec, 4th vec)
5071 E4: extract_odd (3rd vec, 4th vec)
5073 The output for the first stage will be:
5075 E1: 0 2 4 6 8 10 12 14
5076 E2: 1 3 5 7 9 11 13 15
5077 E3: 16 18 20 22 24 26 28 30
5078 E4: 17 19 21 23 25 27 29 31
5080 In order to proceed and create the correct sequence for the next stage (or
5081 for the correct output, if the second stage is the last one, as in our
5082 example), we first put the output of extract_even operation and then the
5083 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5084 The input for the second stage is:
5086 1st vec (E1): 0 2 4 6 8 10 12 14
5087 2nd vec (E3): 16 18 20 22 24 26 28 30
5088 3rd vec (E2): 1 3 5 7 9 11 13 15
5089 4th vec (E4): 17 19 21 23 25 27 29 31
5091 The output of the second stage:
5093 E1: 0 4 8 12 16 20 24 28
5094 E2: 2 6 10 14 18 22 26 30
5095 E3: 1 5 9 13 17 21 25 29
5096 E4: 3 7 11 15 19 23 27 31
5098 And RESULT_CHAIN after reordering:
5100 1st vec (E1): 0 4 8 12 16 20 24 28
5101 2nd vec (E3): 1 5 9 13 17 21 25 29
5102 3rd vec (E2): 2 6 10 14 18 22 26 30
5103 4th vec (E4): 3 7 11 15 19 23 27 31. */
5105 static void
5106 vect_permute_load_chain (vec<tree> dr_chain,
5107 unsigned int length,
5108 gimple *stmt,
5109 gimple_stmt_iterator *gsi,
5110 vec<tree> *result_chain)
5112 tree data_ref, first_vect, second_vect;
5113 tree perm_mask_even, perm_mask_odd;
5114 tree perm3_mask_low, perm3_mask_high;
5115 gimple *perm_stmt;
5116 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5117 unsigned int i, j, log_length = exact_log2 (length);
5118 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5119 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5121 result_chain->quick_grow (length);
5122 memcpy (result_chain->address (), dr_chain.address (),
5123 length * sizeof (tree));
5125 if (length == 3)
5127 unsigned int k;
5129 for (k = 0; k < 3; k++)
5131 for (i = 0; i < nelt; i++)
5132 if (3 * i + k < 2 * nelt)
5133 sel[i] = 3 * i + k;
5134 else
5135 sel[i] = 0;
5136 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5138 for (i = 0, j = 0; i < nelt; i++)
5139 if (3 * i + k < 2 * nelt)
5140 sel[i] = i;
5141 else
5142 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5144 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5146 first_vect = dr_chain[0];
5147 second_vect = dr_chain[1];
5149 /* Create interleaving stmt (low part of):
5150 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5151 ...}> */
5152 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5153 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5154 second_vect, perm3_mask_low);
5155 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5157 /* Create interleaving stmt (high part of):
5158 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5159 ...}> */
5160 first_vect = data_ref;
5161 second_vect = dr_chain[2];
5162 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5163 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5164 second_vect, perm3_mask_high);
5165 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5166 (*result_chain)[k] = data_ref;
5169 else
5171 /* If length is not equal to 3 then only power of 2 is supported. */
5172 gcc_assert (pow2p_hwi (length));
5174 for (i = 0; i < nelt; ++i)
5175 sel[i] = i * 2;
5176 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5178 for (i = 0; i < nelt; ++i)
5179 sel[i] = i * 2 + 1;
5180 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5182 for (i = 0; i < log_length; i++)
5184 for (j = 0; j < length; j += 2)
5186 first_vect = dr_chain[j];
5187 second_vect = dr_chain[j+1];
5189 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5190 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5191 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5192 first_vect, second_vect,
5193 perm_mask_even);
5194 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5195 (*result_chain)[j/2] = data_ref;
5197 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5198 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5199 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5200 first_vect, second_vect,
5201 perm_mask_odd);
5202 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5203 (*result_chain)[j/2+length/2] = data_ref;
5205 memcpy (dr_chain.address (), result_chain->address (),
5206 length * sizeof (tree));
5211 /* Function vect_shift_permute_load_chain.
5213 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5214 sequence of stmts to reorder the input data accordingly.
5215 Return the final references for loads in RESULT_CHAIN.
5216 Return true if successed, false otherwise.
5218 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5219 The input is 3 vectors each containing 8 elements. We assign a
5220 number to each element, the input sequence is:
5222 1st vec: 0 1 2 3 4 5 6 7
5223 2nd vec: 8 9 10 11 12 13 14 15
5224 3rd vec: 16 17 18 19 20 21 22 23
5226 The output sequence should be:
5228 1st vec: 0 3 6 9 12 15 18 21
5229 2nd vec: 1 4 7 10 13 16 19 22
5230 3rd vec: 2 5 8 11 14 17 20 23
5232 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5234 First we shuffle all 3 vectors to get correct elements order:
5236 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5237 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5238 3rd vec: (16 19 22) (17 20 23) (18 21)
5240 Next we unite and shift vector 3 times:
5242 1st step:
5243 shift right by 6 the concatenation of:
5244 "1st vec" and "2nd vec"
5245 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5246 "2nd vec" and "3rd vec"
5247 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5248 "3rd vec" and "1st vec"
5249 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5250 | New vectors |
5252 So that now new vectors are:
5254 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5255 2nd vec: (10 13) (16 19 22) (17 20 23)
5256 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5258 2nd step:
5259 shift right by 5 the concatenation of:
5260 "1st vec" and "3rd vec"
5261 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5262 "2nd vec" and "1st vec"
5263 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5264 "3rd vec" and "2nd vec"
5265 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5266 | New vectors |
5268 So that now new vectors are:
5270 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5271 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5272 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5274 3rd step:
5275 shift right by 5 the concatenation of:
5276 "1st vec" and "1st vec"
5277 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5278 shift right by 3 the concatenation of:
5279 "2nd vec" and "2nd vec"
5280 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5281 | New vectors |
5283 So that now all vectors are READY:
5284 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5285 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5286 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5288 This algorithm is faster than one in vect_permute_load_chain if:
5289 1. "shift of a concatination" is faster than general permutation.
5290 This is usually so.
5291 2. The TARGET machine can't execute vector instructions in parallel.
5292 This is because each step of the algorithm depends on previous.
5293 The algorithm in vect_permute_load_chain is much more parallel.
5295 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5298 static bool
5299 vect_shift_permute_load_chain (vec<tree> dr_chain,
5300 unsigned int length,
5301 gimple *stmt,
5302 gimple_stmt_iterator *gsi,
5303 vec<tree> *result_chain)
5305 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5306 tree perm2_mask1, perm2_mask2, perm3_mask;
5307 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5308 gimple *perm_stmt;
5310 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5311 unsigned int i;
5312 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5313 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5314 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5315 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5317 result_chain->quick_grow (length);
5318 memcpy (result_chain->address (), dr_chain.address (),
5319 length * sizeof (tree));
5321 if (pow2p_hwi (length) && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5323 unsigned int j, log_length = exact_log2 (length);
5324 for (i = 0; i < nelt / 2; ++i)
5325 sel[i] = i * 2;
5326 for (i = 0; i < nelt / 2; ++i)
5327 sel[nelt / 2 + i] = i * 2 + 1;
5328 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5330 if (dump_enabled_p ())
5331 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5332 "shuffle of 2 fields structure is not \
5333 supported by target\n");
5334 return false;
5336 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5338 for (i = 0; i < nelt / 2; ++i)
5339 sel[i] = i * 2 + 1;
5340 for (i = 0; i < nelt / 2; ++i)
5341 sel[nelt / 2 + i] = i * 2;
5342 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5344 if (dump_enabled_p ())
5345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5346 "shuffle of 2 fields structure is not \
5347 supported by target\n");
5348 return false;
5350 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5352 /* Generating permutation constant to shift all elements.
5353 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5354 for (i = 0; i < nelt; i++)
5355 sel[i] = nelt / 2 + i;
5356 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5358 if (dump_enabled_p ())
5359 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5360 "shift permutation is not supported by target\n");
5361 return false;
5363 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5365 /* Generating permutation constant to select vector from 2.
5366 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5367 for (i = 0; i < nelt / 2; i++)
5368 sel[i] = i;
5369 for (i = nelt / 2; i < nelt; i++)
5370 sel[i] = nelt + i;
5371 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5373 if (dump_enabled_p ())
5374 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5375 "select is not supported by target\n");
5376 return false;
5378 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5380 for (i = 0; i < log_length; i++)
5382 for (j = 0; j < length; j += 2)
5384 first_vect = dr_chain[j];
5385 second_vect = dr_chain[j + 1];
5387 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5388 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5389 first_vect, first_vect,
5390 perm2_mask1);
5391 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5392 vect[0] = data_ref;
5394 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5395 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5396 second_vect, second_vect,
5397 perm2_mask2);
5398 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5399 vect[1] = data_ref;
5401 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5402 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5403 vect[0], vect[1], shift1_mask);
5404 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5405 (*result_chain)[j/2 + length/2] = data_ref;
5407 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5408 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5409 vect[0], vect[1], select_mask);
5410 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5411 (*result_chain)[j/2] = data_ref;
5413 memcpy (dr_chain.address (), result_chain->address (),
5414 length * sizeof (tree));
5416 return true;
5418 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5420 unsigned int k = 0, l = 0;
5422 /* Generating permutation constant to get all elements in rigth order.
5423 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5424 for (i = 0; i < nelt; i++)
5426 if (3 * k + (l % 3) >= nelt)
5428 k = 0;
5429 l += (3 - (nelt % 3));
5431 sel[i] = 3 * k + (l % 3);
5432 k++;
5434 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5436 if (dump_enabled_p ())
5437 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5438 "shuffle of 3 fields structure is not \
5439 supported by target\n");
5440 return false;
5442 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5444 /* Generating permutation constant to shift all elements.
5445 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5446 for (i = 0; i < nelt; i++)
5447 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5448 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5450 if (dump_enabled_p ())
5451 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5452 "shift permutation is not supported by target\n");
5453 return false;
5455 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5457 /* Generating permutation constant to shift all elements.
5458 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5459 for (i = 0; i < nelt; i++)
5460 sel[i] = 2 * (nelt / 3) + 1 + i;
5461 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5463 if (dump_enabled_p ())
5464 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5465 "shift permutation is not supported by target\n");
5466 return false;
5468 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5470 /* Generating permutation constant to shift all elements.
5471 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5472 for (i = 0; i < nelt; i++)
5473 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5474 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5476 if (dump_enabled_p ())
5477 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5478 "shift permutation is not supported by target\n");
5479 return false;
5481 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5483 /* Generating permutation constant to shift all elements.
5484 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5485 for (i = 0; i < nelt; i++)
5486 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5487 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5489 if (dump_enabled_p ())
5490 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5491 "shift permutation is not supported by target\n");
5492 return false;
5494 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5496 for (k = 0; k < 3; k++)
5498 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5499 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5500 dr_chain[k], dr_chain[k],
5501 perm3_mask);
5502 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5503 vect[k] = data_ref;
5506 for (k = 0; k < 3; k++)
5508 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5509 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5510 vect[k % 3], vect[(k + 1) % 3],
5511 shift1_mask);
5512 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5513 vect_shift[k] = data_ref;
5516 for (k = 0; k < 3; k++)
5518 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5519 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5520 vect_shift[(4 - k) % 3],
5521 vect_shift[(3 - k) % 3],
5522 shift2_mask);
5523 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5524 vect[k] = data_ref;
5527 (*result_chain)[3 - (nelt % 3)] = vect[2];
5529 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5530 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5531 vect[0], shift3_mask);
5532 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5533 (*result_chain)[nelt % 3] = data_ref;
5535 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5536 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5537 vect[1], shift4_mask);
5538 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5539 (*result_chain)[0] = data_ref;
5540 return true;
5542 return false;
5545 /* Function vect_transform_grouped_load.
5547 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5548 to perform their permutation and ascribe the result vectorized statements to
5549 the scalar statements.
5552 void
5553 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5554 gimple_stmt_iterator *gsi)
5556 machine_mode mode;
5557 vec<tree> result_chain = vNULL;
5559 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5560 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5561 vectors, that are ready for vector computation. */
5562 result_chain.create (size);
5564 /* If reassociation width for vector type is 2 or greater target machine can
5565 execute 2 or more vector instructions in parallel. Otherwise try to
5566 get chain for loads group using vect_shift_permute_load_chain. */
5567 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5568 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5569 || pow2p_hwi (size)
5570 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5571 gsi, &result_chain))
5572 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5573 vect_record_grouped_load_vectors (stmt, result_chain);
5574 result_chain.release ();
5577 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5578 generated as part of the vectorization of STMT. Assign the statement
5579 for each vector to the associated scalar statement. */
5581 void
5582 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5584 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5585 gimple *next_stmt, *new_stmt;
5586 unsigned int i, gap_count;
5587 tree tmp_data_ref;
5589 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5590 Since we scan the chain starting from it's first node, their order
5591 corresponds the order of data-refs in RESULT_CHAIN. */
5592 next_stmt = first_stmt;
5593 gap_count = 1;
5594 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5596 if (!next_stmt)
5597 break;
5599 /* Skip the gaps. Loads created for the gaps will be removed by dead
5600 code elimination pass later. No need to check for the first stmt in
5601 the group, since it always exists.
5602 GROUP_GAP is the number of steps in elements from the previous
5603 access (if there is no gap GROUP_GAP is 1). We skip loads that
5604 correspond to the gaps. */
5605 if (next_stmt != first_stmt
5606 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5608 gap_count++;
5609 continue;
5612 while (next_stmt)
5614 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5615 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5616 copies, and we put the new vector statement in the first available
5617 RELATED_STMT. */
5618 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5619 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5620 else
5622 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5624 gimple *prev_stmt =
5625 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5626 gimple *rel_stmt =
5627 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5628 while (rel_stmt)
5630 prev_stmt = rel_stmt;
5631 rel_stmt =
5632 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5635 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5636 new_stmt;
5640 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5641 gap_count = 1;
5642 /* If NEXT_STMT accesses the same DR as the previous statement,
5643 put the same TMP_DATA_REF as its vectorized statement; otherwise
5644 get the next data-ref from RESULT_CHAIN. */
5645 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5646 break;
5651 /* Function vect_force_dr_alignment_p.
5653 Returns whether the alignment of a DECL can be forced to be aligned
5654 on ALIGNMENT bit boundary. */
5656 bool
5657 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5659 if (!VAR_P (decl))
5660 return false;
5662 if (decl_in_symtab_p (decl)
5663 && !symtab_node::get (decl)->can_increase_alignment_p ())
5664 return false;
5666 if (TREE_STATIC (decl))
5667 return (alignment <= MAX_OFILE_ALIGNMENT);
5668 else
5669 return (alignment <= MAX_STACK_ALIGNMENT);
5673 /* Return whether the data reference DR is supported with respect to its
5674 alignment.
5675 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5676 it is aligned, i.e., check if it is possible to vectorize it with different
5677 alignment. */
5679 enum dr_alignment_support
5680 vect_supportable_dr_alignment (struct data_reference *dr,
5681 bool check_aligned_accesses)
5683 gimple *stmt = DR_STMT (dr);
5684 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5685 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5686 machine_mode mode = TYPE_MODE (vectype);
5687 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5688 struct loop *vect_loop = NULL;
5689 bool nested_in_vect_loop = false;
5691 if (aligned_access_p (dr) && !check_aligned_accesses)
5692 return dr_aligned;
5694 /* For now assume all conditional loads/stores support unaligned
5695 access without any special code. */
5696 if (is_gimple_call (stmt)
5697 && gimple_call_internal_p (stmt)
5698 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5699 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5700 return dr_unaligned_supported;
5702 if (loop_vinfo)
5704 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5705 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5708 /* Possibly unaligned access. */
5710 /* We can choose between using the implicit realignment scheme (generating
5711 a misaligned_move stmt) and the explicit realignment scheme (generating
5712 aligned loads with a REALIGN_LOAD). There are two variants to the
5713 explicit realignment scheme: optimized, and unoptimized.
5714 We can optimize the realignment only if the step between consecutive
5715 vector loads is equal to the vector size. Since the vector memory
5716 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5717 is guaranteed that the misalignment amount remains the same throughout the
5718 execution of the vectorized loop. Therefore, we can create the
5719 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5720 at the loop preheader.
5722 However, in the case of outer-loop vectorization, when vectorizing a
5723 memory access in the inner-loop nested within the LOOP that is now being
5724 vectorized, while it is guaranteed that the misalignment of the
5725 vectorized memory access will remain the same in different outer-loop
5726 iterations, it is *not* guaranteed that is will remain the same throughout
5727 the execution of the inner-loop. This is because the inner-loop advances
5728 with the original scalar step (and not in steps of VS). If the inner-loop
5729 step happens to be a multiple of VS, then the misalignment remains fixed
5730 and we can use the optimized realignment scheme. For example:
5732 for (i=0; i<N; i++)
5733 for (j=0; j<M; j++)
5734 s += a[i+j];
5736 When vectorizing the i-loop in the above example, the step between
5737 consecutive vector loads is 1, and so the misalignment does not remain
5738 fixed across the execution of the inner-loop, and the realignment cannot
5739 be optimized (as illustrated in the following pseudo vectorized loop):
5741 for (i=0; i<N; i+=4)
5742 for (j=0; j<M; j++){
5743 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5744 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5745 // (assuming that we start from an aligned address).
5748 We therefore have to use the unoptimized realignment scheme:
5750 for (i=0; i<N; i+=4)
5751 for (j=k; j<M; j+=4)
5752 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5753 // that the misalignment of the initial address is
5754 // 0).
5756 The loop can then be vectorized as follows:
5758 for (k=0; k<4; k++){
5759 rt = get_realignment_token (&vp[k]);
5760 for (i=0; i<N; i+=4){
5761 v1 = vp[i+k];
5762 for (j=k; j<M; j+=4){
5763 v2 = vp[i+j+VS-1];
5764 va = REALIGN_LOAD <v1,v2,rt>;
5765 vs += va;
5766 v1 = v2;
5769 } */
5771 if (DR_IS_READ (dr))
5773 bool is_packed = false;
5774 tree type = (TREE_TYPE (DR_REF (dr)));
5776 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5777 && (!targetm.vectorize.builtin_mask_for_load
5778 || targetm.vectorize.builtin_mask_for_load ()))
5780 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5782 /* If we are doing SLP then the accesses need not have the
5783 same alignment, instead it depends on the SLP group size. */
5784 if (loop_vinfo
5785 && STMT_SLP_TYPE (stmt_info)
5786 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5787 * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
5788 % TYPE_VECTOR_SUBPARTS (vectype) != 0))
5790 else if (!loop_vinfo
5791 || (nested_in_vect_loop
5792 && (TREE_INT_CST_LOW (DR_STEP (dr))
5793 != GET_MODE_SIZE (TYPE_MODE (vectype)))))
5794 return dr_explicit_realign;
5795 else
5796 return dr_explicit_realign_optimized;
5798 if (!known_alignment_for_access_p (dr))
5799 is_packed = not_size_aligned (DR_REF (dr));
5801 if (targetm.vectorize.support_vector_misalignment
5802 (mode, type, DR_MISALIGNMENT (dr), is_packed))
5803 /* Can't software pipeline the loads, but can at least do them. */
5804 return dr_unaligned_supported;
5806 else
5808 bool is_packed = false;
5809 tree type = (TREE_TYPE (DR_REF (dr)));
5811 if (!known_alignment_for_access_p (dr))
5812 is_packed = not_size_aligned (DR_REF (dr));
5814 if (targetm.vectorize.support_vector_misalignment
5815 (mode, type, DR_MISALIGNMENT (dr), is_packed))
5816 return dr_unaligned_supported;
5819 /* Unsupported. */
5820 return dr_unaligned_unsupported;