gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2017 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
54 /* Return true if load- or store-lanes optab OPTAB is implemented for
55 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
57 static bool
58 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
59 tree vectype, unsigned HOST_WIDE_INT count)
61 machine_mode mode, array_mode;
62 bool limit_p;
64 mode = TYPE_MODE (vectype);
65 limit_p = !targetm.array_mode_supported_p (mode, count);
66 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
67 MODE_INT, limit_p);
69 if (array_mode == BLKmode)
71 if (dump_enabled_p ())
72 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
73 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
74 GET_MODE_NAME (mode), count);
75 return false;
78 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
80 if (dump_enabled_p ())
81 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
82 "cannot use %s<%s><%s>\n", name,
83 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
84 return false;
87 if (dump_enabled_p ())
88 dump_printf_loc (MSG_NOTE, vect_location,
89 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
90 GET_MODE_NAME (mode));
92 return true;
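/* Illustrative usage sketch (added for exposition, not part of the original
   file): a caller asking whether 2-vector load-lanes are available for
   VECTYPE might do, assuming the vec_load_lanes_optab convert optab:

     if (vect_lanes_optab_supported_p ("vec_load_lanes",
                                       vec_load_lanes_optab, vectype, 2))
       ...  take the load-lanes path ...  */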
96 /* Return the smallest scalar part of STMT.
97 This is used to determine the vectype of the stmt. We generally set the
98 vectype according to the type of the result (lhs). For stmts whose
99 result-type is different than the type of the arguments (e.g., demotion,
100 promotion), vectype will be reset appropriately (later). Note that we have
101 to visit the smallest datatype in this function, because that determines the
102 VF. If the smallest datatype in the loop is present only as the rhs of a
103 promotion operation - we'd miss it.
104 Such a case, where a variable of this datatype does not appear in the lhs
105 anywhere in the loop, can only occur if it's an invariant: e.g.:
106 'int_x = (int) short_inv', which we'd expect to have been optimized away by
107 invariant motion. However, we cannot rely on invariant motion to always
108 take invariants out of the loop, and so in the case of promotion we also
109 have to check the rhs.
110 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
111 types. */
113 tree
114 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
115 HOST_WIDE_INT *rhs_size_unit)
117 tree scalar_type = gimple_expr_type (stmt);
118 HOST_WIDE_INT lhs, rhs;
120 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
122 if (is_gimple_assign (stmt)
123 && (gimple_assign_cast_p (stmt)
124 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
125 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
126 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
128 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
130 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
131 if (rhs < lhs)
132 scalar_type = rhs_type;
135 *lhs_size_unit = lhs;
136 *rhs_size_unit = rhs;
137 return scalar_type;
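/* Worked example (added for exposition, not part of the original file):
   for a widening statement such as

     int_x = (int) short_y;

   lhs = 4 and rhs = 2, so *lhs_size_unit = 4, *rhs_size_unit = 2 and the
   2-byte type of short_y is returned, since that smaller type is the one
   that determines the vectorization factor.  */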
141 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
142 tested at run-time. Return TRUE if DDR was successfully inserted.
143 Return false if versioning is not supported. */
145 static bool
146 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
148 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
150 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
151 return false;
153 if (!runtime_alias_check_p (ddr, loop,
154 optimize_loop_nest_for_speed_p (loop)))
155 return false;
157 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
158 return true;
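/* Note (added for exposition, not part of the original file):
   PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS is the internal handle for the
   --param vect-max-version-for-alias-checks knob, so e.g.

     gcc -O3 --param vect-max-version-for-alias-checks=0 ...

   makes the early-out above trigger and disables runtime alias
   versioning altogether.  */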
162 /* Function vect_analyze_data_ref_dependence.
164 Return TRUE if there (might) exist a dependence between a memory-reference
165 DRA and a memory-reference DRB. When versioning for alias may check a
166 dependence at run-time, return FALSE. Adjust *MAX_VF according to
167 the data dependence. */
169 static bool
170 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
171 loop_vec_info loop_vinfo, int *max_vf)
173 unsigned int i;
174 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
175 struct data_reference *dra = DDR_A (ddr);
176 struct data_reference *drb = DDR_B (ddr);
177 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
178 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
179 lambda_vector dist_v;
180 unsigned int loop_depth;
182 /* In loop analysis all data references should be vectorizable. */
183 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
184 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
185 gcc_unreachable ();
187 /* Independent data accesses. */
188 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
189 return false;
191 if (dra == drb
192 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
193 return false;
195 /* We do not have to consider dependences between accesses that belong
196 to the same group. */
197 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
198 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
199 return false;
201 /* Even if we have an anti-dependence then, as the vectorized loop covers at
202 least two scalar iterations, there is always also a true dependence.
203 As the vectorizer does not re-order loads and stores we can ignore
204 the anti-dependence if TBAA can disambiguate both DRs similar to the
205 case with known negative distance anti-dependences (positive
206 distance anti-dependences would violate TBAA constraints). */
207 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
208 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
209 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
210 get_alias_set (DR_REF (drb))))
211 return false;
213 /* Unknown data dependence. */
214 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
216 /* If user asserted safelen consecutive iterations can be
217 executed concurrently, assume independence. */
218 if (loop->safelen >= 2)
220 if (loop->safelen < *max_vf)
221 *max_vf = loop->safelen;
222 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
223 return false;
226 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
227 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
229 if (dump_enabled_p ())
231 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
232 "versioning for alias not supported for: "
233 "can't determine dependence between ");
234 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
235 DR_REF (dra));
236 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
237 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
238 DR_REF (drb));
239 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
241 return true;
244 if (dump_enabled_p ())
246 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
247 "versioning for alias required: "
248 "can't determine dependence between ");
249 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
250 DR_REF (dra));
251 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
252 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
253 DR_REF (drb));
254 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
257 /* Add to list of ddrs that need to be tested at run-time. */
258 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
261 /* Known data dependence. */
262 if (DDR_NUM_DIST_VECTS (ddr) == 0)
264 /* If user asserted safelen consecutive iterations can be
265 executed concurrently, assume independence. */
266 if (loop->safelen >= 2)
268 if (loop->safelen < *max_vf)
269 *max_vf = loop->safelen;
270 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
271 return false;
274 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
275 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
277 if (dump_enabled_p ())
279 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
280 "versioning for alias not supported for: "
281 "bad dist vector for ");
282 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
283 DR_REF (dra));
284 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
285 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
286 DR_REF (drb));
287 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
289 return true;
292 if (dump_enabled_p ())
294 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
295 "versioning for alias required: "
296 "bad dist vector for ");
297 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
298 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
299 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
300 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
302 /* Add to list of ddrs that need to be tested at run-time. */
303 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
306 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
307 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
309 int dist = dist_v[loop_depth];
311 if (dump_enabled_p ())
312 dump_printf_loc (MSG_NOTE, vect_location,
313 "dependence distance = %d.\n", dist);
315 if (dist == 0)
317 if (dump_enabled_p ())
319 dump_printf_loc (MSG_NOTE, vect_location,
320 "dependence distance == 0 between ");
321 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
322 dump_printf (MSG_NOTE, " and ");
323 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
324 dump_printf (MSG_NOTE, "\n");
327 /* When we perform grouped accesses and perform implicit CSE
328 by detecting equal accesses and doing disambiguation with
329 runtime alias tests like for
330 .. = a[i];
331 .. = a[i+1];
332 a[i] = ..;
333 a[i+1] = ..;
334 *p = ..;
335 .. = a[i];
336 .. = a[i+1];
337 where we will end up loading { a[i], a[i+1] } once, make
338 sure that inserting group loads before the first load and
339 stores after the last store will do the right thing.
340 Similar for groups like
341 a[i] = ...;
342 ... = a[i];
343 a[i+1] = ...;
344 where loads from the group interleave with the store. */
345 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
346 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
348 gimple *earlier_stmt;
349 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
350 if (DR_IS_WRITE
351 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
353 if (dump_enabled_p ())
354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
355 "READ_WRITE dependence in interleaving."
356 "\n");
357 return true;
361 continue;
364 if (dist > 0 && DDR_REVERSED_P (ddr))
366 /* If DDR_REVERSED_P the order of the data-refs in DDR was
367 reversed (to make distance vector positive), and the actual
368 distance is negative. */
369 if (dump_enabled_p ())
370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
371 "dependence distance negative.\n");
372 /* Record a negative dependence distance to later limit the
373 amount of stmt copying / unrolling we can perform.
374 Only need to handle read-after-write dependence. */
375 if (DR_IS_READ (drb)
376 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
377 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
378 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
379 continue;
382 if (abs (dist) >= 2
383 && abs (dist) < *max_vf)
385 /* The dependence distance requires reduction of the maximal
386 vectorization factor. */
387 *max_vf = abs (dist);
388 if (dump_enabled_p ())
389 dump_printf_loc (MSG_NOTE, vect_location,
390 "adjusting maximal vectorization factor to %i\n",
391 *max_vf);
394 if (abs (dist) >= *max_vf)
396 /* Dependence distance does not create dependence, as far as
397 vectorization is concerned, in this case. */
398 if (dump_enabled_p ())
399 dump_printf_loc (MSG_NOTE, vect_location,
400 "dependence distance >= VF.\n");
401 continue;
404 if (dump_enabled_p ())
406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
407 "not vectorized, possible dependence "
408 "between data-refs ");
409 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
410 dump_printf (MSG_NOTE, " and ");
411 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
412 dump_printf (MSG_NOTE, "\n");
415 return true;
418 return false;
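/* Worked example (added for exposition, not part of the original file):
   for a known dependence with distance 3 and an incoming *max_vf of 8,
   the code above first lowers *max_vf to 3; abs (dist) is then >= *max_vf,
   so the distance is no longer an obstacle and the function returns false,
   leaving the caller to vectorize with a factor of at most 3.  */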
421 /* Function vect_analyze_data_ref_dependences.
423 Examine all the data references in the loop, and make sure there do not
424 exist any data dependences between them. Set *MAX_VF according to
425 the maximum vectorization factor the data dependences allow. */
427 bool
428 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
430 unsigned int i;
431 struct data_dependence_relation *ddr;
433 if (dump_enabled_p ())
434 dump_printf_loc (MSG_NOTE, vect_location,
435 "=== vect_analyze_data_ref_dependences ===\n");
437 LOOP_VINFO_DDRS (loop_vinfo)
438 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
439 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
440 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
441 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
442 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
443 &LOOP_VINFO_DDRS (loop_vinfo),
444 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
445 return false;
447 /* For epilogues we either have no aliases or alias versioning
448 was applied to original loop. Therefore we may just get max_vf
449 using VF of original loop. */
450 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
451 *max_vf = LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo);
452 else
453 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
454 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
455 return false;
457 return true;
461 /* Function vect_slp_analyze_data_ref_dependence.
463 Return TRUE if there (might) exist a dependence between a memory-reference
464 DRA and a memory-reference DRB. When versioning for alias may check a
465 dependence at run-time, return FALSE. Adjust *MAX_VF according to
466 the data dependence. */
468 static bool
469 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
471 struct data_reference *dra = DDR_A (ddr);
472 struct data_reference *drb = DDR_B (ddr);
474 /* We need to check dependences of statements marked as unvectorizable
475 as well, they still can prohibit vectorization. */
477 /* Independent data accesses. */
478 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
479 return false;
481 if (dra == drb)
482 return false;
484 /* Read-read is OK. */
485 if (DR_IS_READ (dra) && DR_IS_READ (drb))
486 return false;
488 /* If dra and drb are part of the same interleaving chain consider
489 them independent. */
490 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
491 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
492 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
493 return false;
495 /* Unknown data dependence. */
496 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
498 if (dump_enabled_p ())
500 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
501 "can't determine dependence between ");
502 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
503 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
504 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
505 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
508 else if (dump_enabled_p ())
510 dump_printf_loc (MSG_NOTE, vect_location,
511 "determined dependence between ");
512 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
513 dump_printf (MSG_NOTE, " and ");
514 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
515 dump_printf (MSG_NOTE, "\n");
518 return true;
522 /* Analyze dependences involved in the transform of SLP NODE. STORES
523 contain the vector of scalar stores of this instance if we are
524 disambiguating the loads. */
526 static bool
527 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
528 vec<gimple *> stores, gimple *last_store)
530 /* This walks over all stmts involved in the SLP load/store done
531 in NODE verifying we can sink them up to the last stmt in the
532 group. */
533 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
534 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
536 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
537 if (access == last_access)
538 continue;
539 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
540 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
541 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
543 gimple *stmt = gsi_stmt (gsi);
544 if (! gimple_vuse (stmt)
545 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
546 continue;
548 /* If we couldn't record a (single) data reference for this
549 stmt we have to give up. */
550 /* ??? Here and below if dependence analysis fails we can resort
551 to the alias oracle which can handle more kinds of stmts. */
552 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
553 if (!dr_b)
554 return false;
556 bool dependent = false;
557 /* If we run into a store of this same instance (we've just
558 marked those) then delay dependence checking until we run
559 into the last store because this is where it will have
560 been sunk to (and we verify if we can do that as well). */
561 if (gimple_visited_p (stmt))
563 if (stmt != last_store)
564 continue;
565 unsigned i;
566 gimple *store;
567 FOR_EACH_VEC_ELT (stores, i, store)
569 data_reference *store_dr
570 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
571 ddr_p ddr = initialize_data_dependence_relation
572 (dr_a, store_dr, vNULL);
573 dependent = vect_slp_analyze_data_ref_dependence (ddr);
574 free_dependence_relation (ddr);
575 if (dependent)
576 break;
579 else
581 ddr_p ddr = initialize_data_dependence_relation (dr_a,
582 dr_b, vNULL);
583 dependent = vect_slp_analyze_data_ref_dependence (ddr);
584 free_dependence_relation (ddr);
586 if (dependent)
587 return false;
590 return true;
594 /* Function vect_analyze_data_ref_dependences.
596 Examine all the data references in the basic-block, and make sure there
597 do not exist any data dependences between them. Set *MAX_VF according to
598 the maximum vectorization factor the data dependences allow. */
600 bool
601 vect_slp_analyze_instance_dependence (slp_instance instance)
603 if (dump_enabled_p ())
604 dump_printf_loc (MSG_NOTE, vect_location,
605 "=== vect_slp_analyze_instance_dependence ===\n");
607 /* The stores of this instance are at the root of the SLP tree. */
608 slp_tree store = SLP_INSTANCE_TREE (instance);
609 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
610 store = NULL;
612 /* Verify we can sink stores to the vectorized stmt insert location. */
613 gimple *last_store = NULL;
614 if (store)
616 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
617 return false;
619 /* Mark stores in this instance and remember the last one. */
620 last_store = vect_find_last_scalar_stmt_in_slp (store);
621 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
622 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
625 bool res = true;
627 /* Verify we can sink loads to the vectorized stmt insert location,
628 special-casing stores of this instance. */
629 slp_tree load;
630 unsigned int i;
631 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
632 if (! vect_slp_analyze_node_dependences (instance, load,
633 store
634 ? SLP_TREE_SCALAR_STMTS (store)
635 : vNULL, last_store))
637 res = false;
638 break;
641 /* Unset the visited flag. */
642 if (store)
643 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
644 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
646 return res;
649 /* Function vect_compute_data_ref_alignment
651 Compute the misalignment of the data reference DR.
653 Output:
654 1. If during the misalignment computation it is found that the data reference
655 cannot be vectorized then false is returned.
656 2. DR_MISALIGNMENT (DR) is defined.
658 FOR NOW: No analysis is actually performed. Misalignment is calculated
659 only for trivial cases. TODO. */
661 bool
662 vect_compute_data_ref_alignment (struct data_reference *dr)
664 gimple *stmt = DR_STMT (dr);
665 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
666 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
667 struct loop *loop = NULL;
668 tree ref = DR_REF (dr);
669 tree vectype;
670 tree base, base_addr;
671 tree misalign = NULL_TREE;
672 tree aligned_to;
673 tree step;
674 unsigned HOST_WIDE_INT alignment;
676 if (dump_enabled_p ())
677 dump_printf_loc (MSG_NOTE, vect_location,
678 "vect_compute_data_ref_alignment:\n");
680 if (loop_vinfo)
681 loop = LOOP_VINFO_LOOP (loop_vinfo);
683 /* Initialize misalignment to unknown. */
684 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
686 if (tree_fits_shwi_p (DR_STEP (dr)))
687 misalign = DR_INIT (dr);
688 aligned_to = DR_ALIGNED_TO (dr);
689 base_addr = DR_BASE_ADDRESS (dr);
690 vectype = STMT_VINFO_VECTYPE (stmt_info);
692 /* In case the dataref is in an inner-loop of the loop that is being
693 vectorized (LOOP), we use the base and misalignment information
694 relative to the outer-loop (LOOP). This is ok only if the misalignment
695 stays the same throughout the execution of the inner-loop, which is why
696 we have to check that the stride of the dataref in the inner-loop is a
697 multiple of the vector size. */
698 if (loop && nested_in_vect_loop_p (loop, stmt))
700 tree step = DR_STEP (dr);
702 if (tree_fits_shwi_p (step)
703 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
705 if (dump_enabled_p ())
706 dump_printf_loc (MSG_NOTE, vect_location,
707 "inner step divides the vector-size.\n");
708 misalign = STMT_VINFO_DR_INIT (stmt_info);
709 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
710 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
712 else
714 if (dump_enabled_p ())
715 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
716 "inner step doesn't divide the vector-size.\n");
717 misalign = NULL_TREE;
721 /* Similarly we can only use base and misalignment information relative to
722 an innermost loop if the misalignment stays the same throughout the
723 execution of the loop. As above, this is the case if the stride of
724 the dataref is a multiple of the vector size. */
725 else
727 tree step = DR_STEP (dr);
728 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
730 if (tree_fits_shwi_p (step)
731 && ((tree_to_shwi (step) * vf)
732 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
734 if (dump_enabled_p ())
735 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
736 "step doesn't divide the vector-size.\n");
737 misalign = NULL_TREE;
741 /* To look at alignment of the base we have to preserve an inner MEM_REF
742 as that carries alignment information of the actual access. */
743 base = ref;
744 while (handled_component_p (base))
745 base = TREE_OPERAND (base, 0);
746 unsigned int base_alignment = 0;
747 unsigned HOST_WIDE_INT base_bitpos;
748 get_object_alignment_1 (base, &base_alignment, &base_bitpos);
749 /* As data-ref analysis strips the MEM_REF down to its base operand
750 to form DR_BASE_ADDRESS and adds the offset to DR_INIT we have to
751 adjust things to make base_alignment valid as the alignment of
752 DR_BASE_ADDRESS. */
753 if (TREE_CODE (base) == MEM_REF)
755 /* Note all this only works if DR_BASE_ADDRESS is the same as
756 MEM_REF operand zero, otherwise DR/SCEV analysis might have factored
757 in other offsets. We need to rework DR to compute the alignment
758 of DR_BASE_ADDRESS as long as all information is still available. */
759 if (operand_equal_p (TREE_OPERAND (base, 0), base_addr, 0))
761 base_bitpos -= mem_ref_offset (base).to_short_addr () * BITS_PER_UNIT;
762 base_bitpos &= (base_alignment - 1);
764 else
765 base_bitpos = BITS_PER_UNIT;
767 if (base_bitpos != 0)
768 base_alignment = base_bitpos & -base_bitpos;
769 /* Also look at the alignment of the base address DR analysis
770 computed. */
771 unsigned int base_addr_alignment = get_pointer_alignment (base_addr);
772 if (base_addr_alignment > base_alignment)
773 base_alignment = base_addr_alignment;
775 if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
776 DR_VECT_AUX (dr)->base_element_aligned = true;
778 alignment = TYPE_ALIGN_UNIT (vectype);
780 if ((compare_tree_int (aligned_to, alignment) < 0)
781 || !misalign)
783 if (dump_enabled_p ())
785 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
786 "Unknown alignment for access: ");
787 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
788 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
790 return true;
793 if (base_alignment < TYPE_ALIGN (vectype))
795 base = base_addr;
796 if (TREE_CODE (base) == ADDR_EXPR)
797 base = TREE_OPERAND (base, 0);
798 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
800 if (dump_enabled_p ())
802 dump_printf_loc (MSG_NOTE, vect_location,
803 "can't force alignment of ref: ");
804 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
805 dump_printf (MSG_NOTE, "\n");
807 return true;
810 if (DECL_USER_ALIGN (base))
812 if (dump_enabled_p ())
814 dump_printf_loc (MSG_NOTE, vect_location,
815 "not forcing alignment of user-aligned "
816 "variable: ");
817 dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
818 dump_printf (MSG_NOTE, "\n");
820 return true;
823 /* Force the alignment of the decl.
824 NOTE: This is the only change to the code we make during
825 the analysis phase, before deciding to vectorize the loop. */
826 if (dump_enabled_p ())
828 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
829 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
830 dump_printf (MSG_NOTE, "\n");
833 DR_VECT_AUX (dr)->base_decl = base;
834 DR_VECT_AUX (dr)->base_misaligned = true;
835 DR_VECT_AUX (dr)->base_element_aligned = true;
838 if (loop && nested_in_vect_loop_p (loop, stmt))
839 step = STMT_VINFO_DR_STEP (stmt_info);
840 else
841 step = DR_STEP (dr);
842 /* If this is a backward running DR then first access in the larger
843 vectype actually is N-1 elements before the address in the DR.
844 Adjust misalign accordingly. */
845 if (tree_int_cst_sgn (step) < 0)
847 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
848 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
849 otherwise we wouldn't be here. */
850 offset = fold_build2 (MULT_EXPR, ssizetype, offset, step);
851 /* PLUS because STEP was negative. */
852 misalign = size_binop (PLUS_EXPR, misalign, offset);
855 SET_DR_MISALIGNMENT (dr,
856 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
858 if (dump_enabled_p ())
860 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
861 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
862 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
863 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
866 return true;
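/* Worked example (added for exposition, not part of the original file),
   assuming a sufficiently aligned base, DR_INIT = 0 and a backward V4SI
   access (16-byte vectype, four 4-byte elements, DR_STEP = -4):
   offset = 3 * -4 = -12, misalign = 0 + -12 = -12, and
   wi::mod_floor (-12, 16, SIGNED) = 4, so DR_MISALIGNMENT is set to 4.  */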
870 /* Function vect_update_misalignment_for_peel.
871 Sets DR's misalignment
872 - to 0 if it has the same alignment as DR_PEEL,
873 - to the misalignment computed using NPEEL if DR's misalignment is known,
874 - to -1 (unknown) otherwise.
876 DR - the data reference whose misalignment is to be adjusted.
877 DR_PEEL - the data reference whose misalignment is being made
878 zero in the vector loop by the peel.
879 NPEEL - the number of iterations in the peel loop if the misalignment
880 of DR_PEEL is known at compile time. */
882 static void
883 vect_update_misalignment_for_peel (struct data_reference *dr,
884 struct data_reference *dr_peel, int npeel)
886 unsigned int i;
887 vec<dr_p> same_aligned_drs;
888 struct data_reference *current_dr;
889 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
890 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
891 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
892 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
894 /* For interleaved data accesses the step in the loop must be multiplied by
895 the size of the interleaving group. */
896 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
897 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
898 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
899 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
901 /* It can be assumed that the data refs with the same alignment as dr_peel
902 are aligned in the vector loop. */
903 same_aligned_drs
904 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
905 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
907 if (current_dr != dr)
908 continue;
909 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
910 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
911 SET_DR_MISALIGNMENT (dr, 0);
912 return;
915 if (known_alignment_for_access_p (dr)
916 && known_alignment_for_access_p (dr_peel))
918 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
919 int misal = DR_MISALIGNMENT (dr);
920 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
921 misal += negative ? -npeel * dr_size : npeel * dr_size;
922 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
923 SET_DR_MISALIGNMENT (dr, misal);
924 return;
927 if (dump_enabled_p ())
928 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
929 "to unknown (-1).\n");
930 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
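/* Worked example (added for exposition, not part of the original file):
   for a forward DR with 4-byte elements, a known misalignment of 8,
   NPEEL = 2 and a 16-byte vectype: misal = 8 + 2 * 4 = 16, and masking
   with TYPE_ALIGN / BITS_PER_UNIT - 1 = 15 gives 0, i.e. the peel that
   aligns DR_PEEL happens to align this DR as well.  */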
934 /* Function verify_data_ref_alignment
936 Return TRUE if DR can be handled with respect to alignment. */
938 static bool
939 verify_data_ref_alignment (data_reference_p dr)
941 enum dr_alignment_support supportable_dr_alignment
942 = vect_supportable_dr_alignment (dr, false);
943 if (!supportable_dr_alignment)
945 if (dump_enabled_p ())
947 if (DR_IS_READ (dr))
948 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
949 "not vectorized: unsupported unaligned load.");
950 else
951 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
952 "not vectorized: unsupported unaligned "
953 "store.");
955 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
956 DR_REF (dr));
957 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
959 return false;
962 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
963 dump_printf_loc (MSG_NOTE, vect_location,
964 "Vectorizing an unaligned access.\n");
966 return true;
969 /* Function vect_verify_datarefs_alignment
971 Return TRUE if all data references in the loop can be
972 handled with respect to alignment. */
974 bool
975 vect_verify_datarefs_alignment (loop_vec_info vinfo)
977 vec<data_reference_p> datarefs = vinfo->datarefs;
978 struct data_reference *dr;
979 unsigned int i;
981 FOR_EACH_VEC_ELT (datarefs, i, dr)
983 gimple *stmt = DR_STMT (dr);
984 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
986 if (!STMT_VINFO_RELEVANT_P (stmt_info))
987 continue;
989 /* For interleaving, only the alignment of the first access matters. */
990 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
991 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
992 continue;
994 /* Strided accesses perform only component accesses, alignment is
995 irrelevant for them. */
996 if (STMT_VINFO_STRIDED_P (stmt_info)
997 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
998 continue;
1000 if (! verify_data_ref_alignment (dr))
1001 return false;
1004 return true;
1007 /* Given a memory reference EXP, return whether its alignment is less
1008 than its size. */
1010 static bool
1011 not_size_aligned (tree exp)
1013 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1014 return true;
1016 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1017 > get_object_alignment (exp));
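/* Illustrative example (added for exposition, not part of the original
   file): a 4-byte int member of a struct declared with
   __attribute__((packed)) typically has get_object_alignment of only
   8 bits while its TYPE_SIZE is 32 bits, so not_size_aligned returns
   true for it.  */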
1020 /* Function vector_alignment_reachable_p
1022 Return true if vector alignment for DR is reachable by peeling
1023 a few loop iterations. Return false otherwise. */
1025 static bool
1026 vector_alignment_reachable_p (struct data_reference *dr)
1028 gimple *stmt = DR_STMT (dr);
1029 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1030 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1032 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1034 /* For interleaved access we peel only if number of iterations in
1035 the prolog loop ({VF - misalignment}), is a multiple of the
1036 number of the interleaved accesses. */
1037 int elem_size, mis_in_elements;
1038 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1040 /* FORNOW: handle only known alignment. */
1041 if (!known_alignment_for_access_p (dr))
1042 return false;
1044 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1045 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1047 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1048 return false;
1051 /* If misalignment is known at the compile time then allow peeling
1052 only if natural alignment is reachable through peeling. */
1053 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1055 HOST_WIDE_INT elmsize =
1056 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1057 if (dump_enabled_p ())
1059 dump_printf_loc (MSG_NOTE, vect_location,
1060 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1061 dump_printf (MSG_NOTE,
1062 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1064 if (DR_MISALIGNMENT (dr) % elmsize)
1066 if (dump_enabled_p ())
1067 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1068 "data size does not divide the misalignment.\n");
1069 return false;
1073 if (!known_alignment_for_access_p (dr))
1075 tree type = TREE_TYPE (DR_REF (dr));
1076 bool is_packed = not_size_aligned (DR_REF (dr));
1077 if (dump_enabled_p ())
1078 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1079 "Unknown misalignment, %snaturally aligned\n",
1080 is_packed ? "not " : "");
1081 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1084 return true;
1088 /* Calculate the cost of the memory access represented by DR. */
1090 static void
1091 vect_get_data_access_cost (struct data_reference *dr,
1092 unsigned int *inside_cost,
1093 unsigned int *outside_cost,
1094 stmt_vector_for_cost *body_cost_vec)
1096 gimple *stmt = DR_STMT (dr);
1097 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1098 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1099 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1100 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1101 int ncopies = MAX (1, vf / nunits); /* TODO: Handle SLP properly */
1103 if (DR_IS_READ (dr))
1104 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1105 NULL, body_cost_vec, false);
1106 else
1107 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1109 if (dump_enabled_p ())
1110 dump_printf_loc (MSG_NOTE, vect_location,
1111 "vect_get_data_access_cost: inside_cost = %d, "
1112 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1116 typedef struct _vect_peel_info
1118 struct data_reference *dr;
1119 int npeel;
1120 unsigned int count;
1121 } *vect_peel_info;
1123 typedef struct _vect_peel_extended_info
1125 struct _vect_peel_info peel_info;
1126 unsigned int inside_cost;
1127 unsigned int outside_cost;
1128 stmt_vector_for_cost body_cost_vec;
1129 } *vect_peel_extended_info;
1132 /* Peeling hashtable helpers. */
1134 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1136 static inline hashval_t hash (const _vect_peel_info *);
1137 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1140 inline hashval_t
1141 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1143 return (hashval_t) peel_info->npeel;
1146 inline bool
1147 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1149 return (a->npeel == b->npeel);
1153 /* Insert DR into peeling hash table with NPEEL as key. */
1155 static void
1156 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1157 loop_vec_info loop_vinfo, struct data_reference *dr,
1158 int npeel)
1160 struct _vect_peel_info elem, *slot;
1161 _vect_peel_info **new_slot;
1162 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1164 elem.npeel = npeel;
1165 slot = peeling_htab->find (&elem);
1166 if (slot)
1167 slot->count++;
1168 else
1170 slot = XNEW (struct _vect_peel_info);
1171 slot->npeel = npeel;
1172 slot->dr = dr;
1173 slot->count = 1;
1174 new_slot = peeling_htab->find_slot (slot, INSERT);
1175 *new_slot = slot;
1178 if (!supportable_dr_alignment
1179 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1180 slot->count += VECT_MAX_COST;
1184 /* Traverse peeling hash table to find peeling option that aligns maximum
1185 number of data accesses. */
1188 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1189 _vect_peel_extended_info *max)
1191 vect_peel_info elem = *slot;
1193 if (elem->count > max->peel_info.count
1194 || (elem->count == max->peel_info.count
1195 && max->peel_info.npeel > elem->npeel))
1197 max->peel_info.npeel = elem->npeel;
1198 max->peel_info.count = elem->count;
1199 max->peel_info.dr = elem->dr;
1202 return 1;
1205 /* Get the costs of peeling NPEEL iterations checking data access costs
1206 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1207 misalignment will be zero after peeling. */
1209 static void
1210 vect_get_peeling_costs_all_drs (struct data_reference *dr0,
1211 unsigned int *inside_cost,
1212 unsigned int *outside_cost,
1213 stmt_vector_for_cost *body_cost_vec,
1214 unsigned int npeel,
1215 bool unknown_misalignment)
1217 gimple *stmt = DR_STMT (dr0);
1218 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1219 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1220 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1222 unsigned i;
1223 data_reference *dr;
1225 FOR_EACH_VEC_ELT (datarefs, i, dr)
1227 stmt = DR_STMT (dr);
1228 stmt_info = vinfo_for_stmt (stmt);
1229 /* For interleaving, only the alignment of the first access
1230 matters. */
1231 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1232 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1233 continue;
1235 /* Strided accesses perform only component accesses, alignment is
1236 irrelevant for them. */
1237 if (STMT_VINFO_STRIDED_P (stmt_info)
1238 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1239 continue;
1241 int save_misalignment;
1242 save_misalignment = DR_MISALIGNMENT (dr);
1243 if (unknown_misalignment && dr == dr0)
1244 SET_DR_MISALIGNMENT (dr, 0);
1245 else
1246 vect_update_misalignment_for_peel (dr, dr0, npeel);
1247 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1248 body_cost_vec);
1249 SET_DR_MISALIGNMENT (dr, save_misalignment);
1253 /* Traverse peeling hash table and calculate cost for each peeling option.
1254 Find the one with the lowest cost. */
1257 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1258 _vect_peel_extended_info *min)
1260 vect_peel_info elem = *slot;
1261 int dummy;
1262 unsigned int inside_cost = 0, outside_cost = 0;
1263 gimple *stmt = DR_STMT (elem->dr);
1264 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1265 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1266 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1267 epilogue_cost_vec;
1269 prologue_cost_vec.create (2);
1270 body_cost_vec.create (2);
1271 epilogue_cost_vec.create (2);
1273 vect_get_peeling_costs_all_drs (elem->dr, &inside_cost, &outside_cost,
1274 &body_cost_vec, elem->npeel, false);
1276 outside_cost += vect_get_known_peeling_cost
1277 (loop_vinfo, elem->npeel, &dummy,
1278 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1279 &prologue_cost_vec, &epilogue_cost_vec);
1281 /* Prologue and epilogue costs are added to the target model later.
1282 These costs depend only on the scalar iteration cost, the
1283 number of peeling iterations finally chosen, and the number of
1284 misaligned statements. So discard the information found here. */
1285 prologue_cost_vec.release ();
1286 epilogue_cost_vec.release ();
1288 if (inside_cost < min->inside_cost
1289 || (inside_cost == min->inside_cost
1290 && outside_cost < min->outside_cost))
1292 min->inside_cost = inside_cost;
1293 min->outside_cost = outside_cost;
1294 min->body_cost_vec.release ();
1295 min->body_cost_vec = body_cost_vec;
1296 min->peel_info.dr = elem->dr;
1297 min->peel_info.npeel = elem->npeel;
1298 min->peel_info.count = elem->count;
1300 else
1301 body_cost_vec.release ();
1303 return 1;
1307 /* Choose best peeling option by traversing peeling hash table and either
1308 choosing an option with the lowest cost (if cost model is enabled) or the
1309 option that aligns as many accesses as possible. */
1311 static struct _vect_peel_extended_info
1312 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1313 loop_vec_info loop_vinfo,
1314 unsigned int *npeel,
1315 stmt_vector_for_cost *body_cost_vec)
1317 struct _vect_peel_extended_info res;
1319 res.peel_info.dr = NULL;
1320 res.body_cost_vec = stmt_vector_for_cost ();
1322 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1324 res.inside_cost = INT_MAX;
1325 res.outside_cost = INT_MAX;
1326 peeling_htab->traverse <_vect_peel_extended_info *,
1327 vect_peeling_hash_get_lowest_cost> (&res);
1329 else
1331 res.peel_info.count = 0;
1332 peeling_htab->traverse <_vect_peel_extended_info *,
1333 vect_peeling_hash_get_most_frequent> (&res);
1334 res.inside_cost = 0;
1335 res.outside_cost = 0;
1338 *npeel = res.peel_info.npeel;
1339 *body_cost_vec = res.body_cost_vec;
1340 return res;
1343 /* Return true if the new peeling NPEEL is supported. */
1345 static bool
1346 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1347 unsigned npeel)
1349 unsigned i;
1350 struct data_reference *dr = NULL;
1351 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1352 gimple *stmt;
1353 stmt_vec_info stmt_info;
1354 enum dr_alignment_support supportable_dr_alignment;
1356 /* Ensure that all data refs can be vectorized after the peel. */
1357 FOR_EACH_VEC_ELT (datarefs, i, dr)
1359 int save_misalignment;
1361 if (dr == dr0)
1362 continue;
1364 stmt = DR_STMT (dr);
1365 stmt_info = vinfo_for_stmt (stmt);
1366 /* For interleaving, only the alignment of the first access
1367 matters. */
1368 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1369 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1370 continue;
1372 /* Strided accesses perform only component accesses, alignment is
1373 irrelevant for them. */
1374 if (STMT_VINFO_STRIDED_P (stmt_info)
1375 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1376 continue;
1378 save_misalignment = DR_MISALIGNMENT (dr);
1379 vect_update_misalignment_for_peel (dr, dr0, npeel);
1380 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1381 SET_DR_MISALIGNMENT (dr, save_misalignment);
1383 if (!supportable_dr_alignment)
1384 return false;
1387 return true;
1390 /* Function vect_enhance_data_refs_alignment
1392 This pass will use loop versioning and loop peeling in order to enhance
1393 the alignment of data references in the loop.
1395 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1396 original loop is to be vectorized. Any other loops that are created by
1397 the transformations performed in this pass - are not supposed to be
1398 vectorized. This restriction will be relaxed.
1400 This pass will require a cost model to guide it whether to apply peeling
1401 or versioning or a combination of the two. For example, the scheme that
1402 intel uses when given a loop with several memory accesses, is as follows:
1403 choose one memory access ('p') which alignment you want to force by doing
1404 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1405 other accesses are not necessarily aligned, or (2) use loop versioning to
1406 generate one loop in which all accesses are aligned, and another loop in
1407 which only 'p' is necessarily aligned.
1409 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1410 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1411 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1413 Devising a cost model is the most critical aspect of this work. It will
1414 guide us on which access to peel for, whether to use loop versioning, how
1415 many versions to create, etc. The cost model will probably consist of
1416 generic considerations as well as target specific considerations (on
1417 powerpc for example, misaligned stores are more painful than misaligned
1418 loads).
1420 Here are the general steps involved in alignment enhancements:
1422 -- original loop, before alignment analysis:
1423 for (i=0; i<N; i++){
1424 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1425 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1428 -- After vect_compute_data_refs_alignment:
1429 for (i=0; i<N; i++){
1430 x = q[i]; # DR_MISALIGNMENT(q) = 3
1431 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1434 -- Possibility 1: we do loop versioning:
1435 if (p is aligned) {
1436 for (i=0; i<N; i++){ # loop 1A
1437 x = q[i]; # DR_MISALIGNMENT(q) = 3
1438 p[i] = y; # DR_MISALIGNMENT(p) = 0
1441 else {
1442 for (i=0; i<N; i++){ # loop 1B
1443 x = q[i]; # DR_MISALIGNMENT(q) = 3
1444 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1448 -- Possibility 2: we do loop peeling:
1449 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1450 x = q[i];
1451 p[i] = y;
1453 for (i = 3; i < N; i++){ # loop 2A
1454 x = q[i]; # DR_MISALIGNMENT(q) = 0
1455 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1458 -- Possibility 3: combination of loop peeling and versioning:
1459 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1460 x = q[i];
1461 p[i] = y;
1463 if (p is aligned) {
1464 for (i = 3; i<N; i++){ # loop 3A
1465 x = q[i]; # DR_MISALIGNMENT(q) = 0
1466 p[i] = y; # DR_MISALIGNMENT(p) = 0
1469 else {
1470 for (i = 3; i<N; i++){ # loop 3B
1471 x = q[i]; # DR_MISALIGNMENT(q) = 0
1472 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1476 These loops are later passed to loop_transform to be vectorized. The
1477 vectorizer will use the alignment information to guide the transformation
1478 (whether to generate regular loads/stores, or with special handling for
1479 misalignment). */
1481 bool
1482 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1484 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1485 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1486 enum dr_alignment_support supportable_dr_alignment;
1487 struct data_reference *dr0 = NULL, *first_store = NULL;
1488 struct data_reference *dr;
1489 unsigned int i, j;
1490 bool do_peeling = false;
1491 bool do_versioning = false;
1492 bool stat;
1493 gimple *stmt;
1494 stmt_vec_info stmt_info;
1495 unsigned int npeel = 0;
1496 bool one_misalignment_known = false;
1497 bool one_misalignment_unknown = false;
1498 bool one_dr_unsupportable = false;
1499 struct data_reference *unsupportable_dr = NULL;
1500 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1501 unsigned possible_npeel_number = 1;
1502 tree vectype;
1503 unsigned int nelements, mis, same_align_drs_max = 0;
1504 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1505 hash_table<peel_info_hasher> peeling_htab (1);
1507 if (dump_enabled_p ())
1508 dump_printf_loc (MSG_NOTE, vect_location,
1509 "=== vect_enhance_data_refs_alignment ===\n");
1511 /* Reset data so we can safely be called multiple times. */
1512 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1513 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1515 /* While cost model enhancements are expected in the future, the high level
1516 view of the code at this time is as follows:
1518 A) If there is a misaligned access then see if peeling to align
1519 this access can make all data references satisfy
1520 vect_supportable_dr_alignment. If so, update data structures
1521 as needed and return true.
1523 B) If peeling wasn't possible and there is a data reference with an
1524 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1525 then see if loop versioning checks can be used to make all data
1526 references satisfy vect_supportable_dr_alignment. If so, update
1527 data structures as needed and return true.
1529 C) If neither peeling nor versioning were successful then return false if
1530 any data reference does not satisfy vect_supportable_dr_alignment.
1532 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1534 Note, Possibility 3 above (which is peeling and versioning together) is not
1535 being done at this time. */
1537 /* (1) Peeling to force alignment. */
1539 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1540 Considerations:
1541 + How many accesses will become aligned due to the peeling
1542 - How many accesses will become unaligned due to the peeling,
1543 and the cost of misaligned accesses.
1544 - The cost of peeling (the extra runtime checks, the increase
1545 in code size). */
1547 FOR_EACH_VEC_ELT (datarefs, i, dr)
1549 stmt = DR_STMT (dr);
1550 stmt_info = vinfo_for_stmt (stmt);
1552 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1553 continue;
1555 /* For interleaving, only the alignment of the first access
1556 matters. */
1557 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1558 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1559 continue;
1561 /* For invariant accesses there is nothing to enhance. */
1562 if (integer_zerop (DR_STEP (dr)))
1563 continue;
1565 /* Strided accesses perform only component accesses, alignment is
1566 irrelevant for them. */
1567 if (STMT_VINFO_STRIDED_P (stmt_info)
1568 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1569 continue;
1571 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1572 do_peeling = vector_alignment_reachable_p (dr);
1573 if (do_peeling)
1575 if (known_alignment_for_access_p (dr))
1577 unsigned int npeel_tmp = 0;
1578 bool negative = tree_int_cst_compare (DR_STEP (dr),
1579 size_zero_node) < 0;
1581 vectype = STMT_VINFO_VECTYPE (stmt_info);
1582 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1583 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1584 TREE_TYPE (DR_REF (dr))));
1585 if (DR_MISALIGNMENT (dr) != 0)
1586 npeel_tmp = (negative ? (mis - nelements)
1587 : (nelements - mis)) & (nelements - 1);
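/* Worked example (added for exposition, not part of the original file):
   a forward V4SI access (nelements = 4, 4-byte elements) misaligned by
   8 bytes gives mis = 2 and npeel_tmp = (4 - 2) & 3 = 2 peel
   iterations.  */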
1589 /* For multiple types, it is possible that the bigger type access
1590 will have more than one peeling option. E.g., a loop with two
1591 types: one of size (vector size / 4), and the other one of
1592 size (vector size / 8). Vectorization factor will be 8. If both
1593 accesses are misaligned by 3, the first one needs one scalar
1594 iteration to be aligned, and the second one needs 5. But the
1595 first one will be aligned also by peeling 5 scalar
1596 iterations, and in that case both accesses will be aligned.
1597 Hence, except for the immediate peeling amount, we also want
1598 to try to add full vector size, while we don't exceed
1599 vectorization factor.
1600 We do this automatically for cost model, since we calculate
1601 cost for every peeling option. */
1602 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1604 if (STMT_SLP_TYPE (stmt_info))
1605 possible_npeel_number
1606 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1607 else
1608 possible_npeel_number = vf / nelements;
1610 /* NPEEL_TMP is 0 when there is no misalignment, but also
1611 allow peeling NELEMENTS. */
1612 if (DR_MISALIGNMENT (dr) == 0)
1613 possible_npeel_number++;
1616 /* Save info about DR in the hash table. Also include peeling
1617 amounts according to the explanation above. */
1618 for (j = 0; j < possible_npeel_number; j++)
1620 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1621 dr, npeel_tmp);
1622 npeel_tmp += nelements;
1625 one_misalignment_known = true;
1627 else
1629 /* If we don't know any misalignment values, we prefer
1630 peeling for data-ref that has the maximum number of data-refs
1631 with the same alignment, unless the target prefers to align
1632 stores over loads. */
1633 unsigned same_align_drs
1634 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1635 if (!dr0
1636 || same_align_drs_max < same_align_drs)
1638 same_align_drs_max = same_align_drs;
1639 dr0 = dr;
1641 /* For data-refs with the same number of related
1642 accesses prefer the one where the misalign
1643 computation will be invariant in the outermost loop. */
1644 else if (same_align_drs_max == same_align_drs)
1646 struct loop *ivloop0, *ivloop;
1647 ivloop0 = outermost_invariant_loop_for_expr
1648 (loop, DR_BASE_ADDRESS (dr0));
1649 ivloop = outermost_invariant_loop_for_expr
1650 (loop, DR_BASE_ADDRESS (dr));
1651 if ((ivloop && !ivloop0)
1652 || (ivloop && ivloop0
1653 && flow_loop_nested_p (ivloop, ivloop0)))
1654 dr0 = dr;
1657 one_misalignment_unknown = true;
1659 /* Check for data refs with unsupportable alignment that
1660 can be peeled. */
1661 if (!supportable_dr_alignment)
1663 one_dr_unsupportable = true;
1664 unsupportable_dr = dr;
1667 if (!first_store && DR_IS_WRITE (dr))
1668 first_store = dr;
1671 else
1673 if (!aligned_access_p (dr))
1675 if (dump_enabled_p ())
1676 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1677 "vector alignment may not be reachable\n");
1678 break;
1683 /* Check if we can possibly peel the loop. */
1684 if (!vect_can_advance_ivs_p (loop_vinfo)
1685 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1686 || loop->inner)
1687 do_peeling = false;
1689 struct _vect_peel_extended_info peel_for_known_alignment;
1690 struct _vect_peel_extended_info peel_for_unknown_alignment;
1691 struct _vect_peel_extended_info best_peel;
1693 peel_for_unknown_alignment.inside_cost = INT_MAX;
1694 peel_for_unknown_alignment.outside_cost = INT_MAX;
1695 peel_for_unknown_alignment.peel_info.count = 0;
1697 if (do_peeling
1698 && one_misalignment_unknown)
1700 /* Check if the target requires to prefer stores over loads, i.e., if
1701 misaligned stores are more expensive than misaligned loads (taking
1702 drs with same alignment into account). */
1703 unsigned int load_inside_cost = 0;
1704 unsigned int load_outside_cost = 0;
1705 unsigned int store_inside_cost = 0;
1706 unsigned int store_outside_cost = 0;
1708 stmt_vector_for_cost dummy;
1709 dummy.create (2);
1710 vect_get_peeling_costs_all_drs (dr0,
1711 &load_inside_cost,
1712 &load_outside_cost,
1713 &dummy, vf / 2, true);
1714 dummy.release ();
1716 if (first_store)
1718 dummy.create (2);
1719 vect_get_peeling_costs_all_drs (first_store,
1720 &store_inside_cost,
1721 &store_outside_cost,
1722 &dummy, vf / 2, true);
1723 dummy.release ();
1725 else
1727 store_inside_cost = INT_MAX;
1728 store_outside_cost = INT_MAX;
1731 if (load_inside_cost > store_inside_cost
1732 || (load_inside_cost == store_inside_cost
1733 && load_outside_cost > store_outside_cost))
1735 dr0 = first_store;
1736 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1737 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1739 else
1741 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1742 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1745 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1746 prologue_cost_vec.create (2);
1747 epilogue_cost_vec.create (2);
1749 int dummy2;
1750 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1751 (loop_vinfo, vf / 2, &dummy2,
1752 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1753 &prologue_cost_vec, &epilogue_cost_vec);
1755 prologue_cost_vec.release ();
1756 epilogue_cost_vec.release ();
1758 peel_for_unknown_alignment.peel_info.count = 1
1759 + STMT_VINFO_SAME_ALIGN_REFS
1760 (vinfo_for_stmt (DR_STMT (dr0))).length ();
1763 peel_for_unknown_alignment.peel_info.npeel = 0;
1764 peel_for_unknown_alignment.peel_info.dr = dr0;
1766 best_peel = peel_for_unknown_alignment;
1768 peel_for_known_alignment.inside_cost = INT_MAX;
1769 peel_for_known_alignment.outside_cost = INT_MAX;
1770 peel_for_known_alignment.peel_info.count = 0;
1771 peel_for_known_alignment.peel_info.dr = NULL;
1773 if (do_peeling && one_misalignment_known)
1775 /* Peeling is possible, but there is no data access that is supported
1776 only when aligned. So we try to choose the best possible peeling from
1777 the hash table. */
1778 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1779 (&peeling_htab, loop_vinfo, &npeel, &body_cost_vec);
1782 /* Compare costs of peeling for known and unknown alignment. */
1783 if (peel_for_known_alignment.peel_info.dr != NULL
1784 && peel_for_unknown_alignment.inside_cost
1785 >= peel_for_known_alignment.inside_cost)
1787 best_peel = peel_for_known_alignment;
1789 /* If the best peeling for known alignment has NPEEL == 0, perform no
1790 peeling at all except if there is an unsupportable dr that we can
1791 align. */
1792 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1793 do_peeling = false;
1796 /* If there is an unsupportable data ref, prefer peeling for it over all
1797 choices so far, since we would have to discard the chosen peeling anyway
1798 unless it accidentally aligned the unsupportable data ref. */
1799 if (one_dr_unsupportable)
1800 dr0 = unsupportable_dr;
1801 else if (do_peeling)
1803 /* Calculate the penalty for no peeling, i.e. leaving everything
1804 unaligned.
1805 TODO: Adapt vect_get_peeling_costs_all_drs and use here. */
1806 unsigned nopeel_inside_cost = 0;
1807 unsigned nopeel_outside_cost = 0;
1809 stmt_vector_for_cost dummy;
1810 dummy.create (2);
1811 FOR_EACH_VEC_ELT (datarefs, i, dr)
1812 vect_get_data_access_cost (dr, &nopeel_inside_cost,
1813 &nopeel_outside_cost, &dummy);
1814 dummy.release ();
1816 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1817 costs will be recorded. */
1818 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1819 prologue_cost_vec.create (2);
1820 epilogue_cost_vec.create (2);
1822 int dummy2;
1823 nopeel_outside_cost += vect_get_known_peeling_cost
1824 (loop_vinfo, 0, &dummy2,
1825 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1826 &prologue_cost_vec, &epilogue_cost_vec);
1828 prologue_cost_vec.release ();
1829 epilogue_cost_vec.release ();
1831 npeel = best_peel.peel_info.npeel;
1832 dr0 = best_peel.peel_info.dr;
1834 /* If not peeling at all is no more expensive than the best peeling we
1835 have found so far, don't perform any peeling. */
1836 if (nopeel_inside_cost <= best_peel.inside_cost)
1837 do_peeling = false;
1840 if (do_peeling)
1842 stmt = DR_STMT (dr0);
1843 stmt_info = vinfo_for_stmt (stmt);
1844 vectype = STMT_VINFO_VECTYPE (stmt_info);
1845 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1847 if (known_alignment_for_access_p (dr0))
1849 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1850 size_zero_node) < 0;
1851 if (!npeel)
1853 /* Since the misalignment is known at compile time, compute the
1854 number of iterations in the peeled loop (the peeling factor) for
1855 use in updating DR_MISALIGNMENT values. The peeling factor is the
1856 vectorization factor minus the misalignment as an element
1857 count. */
1858 mis = DR_MISALIGNMENT (dr0);
1859 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1860 npeel = ((negative ? mis - nelements : nelements - mis)
1861 & (nelements - 1));
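/* A worked example, assuming V4SI vectors over 4-byte ints and a
   positive step: with DR_MISALIGNMENT (dr0) == 8 bytes, mis becomes
   8 / 4 == 2 elements and npeel == (4 - 2) & 3 == 2, i.e. two scalar
   iterations are peeled to reach an aligned address.  */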
1864 /* For interleaved data access every iteration accesses all the
1865 members of the group, therefore we divide the number of iterations
1866 by the group size. */
1867 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1868 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1869 npeel /= GROUP_SIZE (stmt_info);
1871 if (dump_enabled_p ())
1872 dump_printf_loc (MSG_NOTE, vect_location,
1873 "Try peeling by %d\n", npeel);
1876 /* Ensure that all datarefs can be vectorized after the peel. */
1877 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
1878 do_peeling = false;
1880 /* Check if all datarefs are supportable and log. */
1881 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1883 stat = vect_verify_datarefs_alignment (loop_vinfo);
1884 if (!stat)
1885 do_peeling = false;
1886 else
1888 body_cost_vec.release ();
1889 return stat;
1893 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1894 if (do_peeling)
1896 unsigned max_allowed_peel
1897 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1898 if (max_allowed_peel != (unsigned)-1)
1900 unsigned max_peel = npeel;
1901 if (max_peel == 0)
1903 gimple *dr_stmt = DR_STMT (dr0);
1904 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1905 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1906 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1908 if (max_peel > max_allowed_peel)
1910 do_peeling = false;
1911 if (dump_enabled_p ())
1912 dump_printf_loc (MSG_NOTE, vect_location,
1913 "Disable peeling, max peels reached: %d\n", max_peel);
1918 /* Cost model #2 - if peeling may result in a remaining loop not
1919 iterating enough to be vectorized then do not peel. */
1920 if (do_peeling
1921 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1923 unsigned max_peel
1924 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1925 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1926 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1927 do_peeling = false;
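/* A worked example, assuming a vectorization factor of 4 and npeel == 0:
   max_peel is 3, so a loop with fewer than 4 + 3 == 7 known iterations
   is not peeled, because the remaining loop could not be vectorized.  */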
1930 if (do_peeling)
1932 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1933 If the misalignment of DR_i is identical to that of dr0 then set
1934 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1935 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1936 by the peeling factor times the element size of DR_i (MOD the
1937 vectorization factor times the size). Otherwise, the
1938 misalignment of DR_i must be set to unknown. */
1939 FOR_EACH_VEC_ELT (datarefs, i, dr)
1940 if (dr != dr0)
1942 /* Strided accesses perform only component accesses, alignment
1943 is irrelevant for them. */
1944 stmt_info = vinfo_for_stmt (DR_STMT (dr));
1945 if (STMT_VINFO_STRIDED_P (stmt_info)
1946 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1947 continue;
1949 vect_update_misalignment_for_peel (dr, dr0, npeel);
1952 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1953 if (npeel)
1954 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1955 else
1956 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1957 = DR_MISALIGNMENT (dr0);
1958 SET_DR_MISALIGNMENT (dr0, 0);
1959 if (dump_enabled_p ())
1961 dump_printf_loc (MSG_NOTE, vect_location,
1962 "Alignment of access forced using peeling.\n");
1963 dump_printf_loc (MSG_NOTE, vect_location,
1964 "Peeling for alignment will be applied.\n");
1966 /* The inside-loop cost will be accounted for in vectorizable_load
1967 and vectorizable_store correctly with adjusted alignments.
1968 Drop the body_cost_vec on the floor here. */
1969 body_cost_vec.release ();
1971 stat = vect_verify_datarefs_alignment (loop_vinfo);
1972 gcc_assert (stat);
1973 return stat;
1977 body_cost_vec.release ();
1979 /* (2) Versioning to force alignment. */
1981 /* Try versioning if:
1982 1) optimize loop for speed
1983 2) there is at least one unsupported misaligned data ref with an unknown
1984 misalignment, and
1985 3) all misaligned data refs with a known misalignment are supported, and
1986 4) the number of runtime alignment checks is within reason. */
1988 do_versioning =
1989 optimize_loop_nest_for_speed_p (loop)
1990 && (!loop->inner); /* FORNOW */
1992 if (do_versioning)
1994 FOR_EACH_VEC_ELT (datarefs, i, dr)
1996 stmt = DR_STMT (dr);
1997 stmt_info = vinfo_for_stmt (stmt);
1999 /* For interleaving, only the alignment of the first access
2000 matters. */
2001 if (aligned_access_p (dr)
2002 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2003 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2004 continue;
2006 if (STMT_VINFO_STRIDED_P (stmt_info))
2008 /* Strided loads perform only component accesses, alignment is
2009 irrelevant for them. */
2010 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2011 continue;
2012 do_versioning = false;
2013 break;
2016 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2018 if (!supportable_dr_alignment)
2020 gimple *stmt;
2021 int mask;
2022 tree vectype;
2024 if (known_alignment_for_access_p (dr)
2025 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2026 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2028 do_versioning = false;
2029 break;
2032 stmt = DR_STMT (dr);
2033 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2034 gcc_assert (vectype);
2036 /* The rightmost bits of an aligned address must be zeros.
2037 Construct the mask needed for this test. For example,
2038 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2039 mask must be 15 = 0xf. */
2040 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
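/* A sketch of how this mask is typically consumed when the versioning
   condition is generated (see vect_create_cond_for_align_checks): the
   runtime guard tests roughly
     ((addr_1 | addr_2 | ... | addr_n) & mask) == 0
   so one AND and compare covers all potentially misaligned references.  */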
2042 /* FORNOW: use the same mask to test all potentially unaligned
2043 references in the loop. The vectorizer currently supports
2044 a single vector size, see the reference to
2045 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2046 vectorization factor is computed. */
2047 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2048 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2049 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2050 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2051 DR_STMT (dr));
2055 /* Versioning requires at least one misaligned data reference. */
2056 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2057 do_versioning = false;
2058 else if (!do_versioning)
2059 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2062 if (do_versioning)
2064 vec<gimple *> may_misalign_stmts
2065 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2066 gimple *stmt;
2068 /* It can now be assumed that the data references in the statements
2069 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2070 of the loop being vectorized. */
2071 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2073 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2074 dr = STMT_VINFO_DATA_REF (stmt_info);
2075 SET_DR_MISALIGNMENT (dr, 0);
2076 if (dump_enabled_p ())
2077 dump_printf_loc (MSG_NOTE, vect_location,
2078 "Alignment of access forced using versioning.\n");
2081 if (dump_enabled_p ())
2082 dump_printf_loc (MSG_NOTE, vect_location,
2083 "Versioning for alignment will be applied.\n");
2085 /* Peeling and versioning can't be done together at this time. */
2086 gcc_assert (! (do_peeling && do_versioning));
2088 stat = vect_verify_datarefs_alignment (loop_vinfo);
2089 gcc_assert (stat);
2090 return stat;
2093 /* This point is reached if neither peeling nor versioning is being done. */
2094 gcc_assert (! (do_peeling || do_versioning));
2096 stat = vect_verify_datarefs_alignment (loop_vinfo);
2097 return stat;
2101 /* Function vect_find_same_alignment_drs.
2103 Update group and alignment relations according to the chosen
2104 vectorization factor. */
2106 static void
2107 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2109 struct data_reference *dra = DDR_A (ddr);
2110 struct data_reference *drb = DDR_B (ddr);
2111 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2112 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2114 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2115 return;
2117 if (dra == drb)
2118 return;
2120 if (!operand_equal_p (DR_BASE_OBJECT (dra), DR_BASE_OBJECT (drb),
2121 OEP_ADDRESS_OF)
2122 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2123 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2124 return;
2126 /* Two references with distance zero have the same alignment. */
2127 offset_int diff = (wi::to_offset (DR_INIT (dra))
2128 - wi::to_offset (DR_INIT (drb)));
2129 if (diff != 0)
2131 /* Get the wider of the two alignments. */
2132 unsigned int align_a = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmtinfo_a));
2133 unsigned int align_b = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmtinfo_b));
2134 unsigned int max_align = MAX (align_a, align_b);
2136 /* Require the gap to be a multiple of the larger vector alignment. */
2137 if (!wi::multiple_of_p (diff, max_align, SIGNED))
2138 return;
2141 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2142 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2143 if (dump_enabled_p ())
2145 dump_printf_loc (MSG_NOTE, vect_location,
2146 "accesses have the same alignment: ");
2147 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2148 dump_printf (MSG_NOTE, " and ");
2149 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2150 dump_printf (MSG_NOTE, "\n");
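/* A worked example, assuming V4SI vectors with 16-byte alignment over
   4-byte ints: a[i] and a[i + 4] have DR_INIT values 0 and 16, so diff
   is a multiple of the vector alignment and the pair is recorded as
   having the same alignment, whereas a[i] and a[i + 1] (diff == 4) are
   not.  */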
2155 /* Function vect_analyze_data_refs_alignment
2157 Analyze the alignment of the data-references in the loop.
2158 Return FALSE if a data reference is found that cannot be vectorized. */
2160 bool
2161 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2163 if (dump_enabled_p ())
2164 dump_printf_loc (MSG_NOTE, vect_location,
2165 "=== vect_analyze_data_refs_alignment ===\n");
2167 /* Mark groups of data references with same alignment using
2168 data dependence information. */
2169 vec<ddr_p> ddrs = vinfo->ddrs;
2170 struct data_dependence_relation *ddr;
2171 unsigned int i;
2173 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2174 vect_find_same_alignment_drs (ddr);
2176 vec<data_reference_p> datarefs = vinfo->datarefs;
2177 struct data_reference *dr;
2179 FOR_EACH_VEC_ELT (datarefs, i, dr)
2181 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2182 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2183 && !vect_compute_data_ref_alignment (dr))
2185 /* Strided accesses perform only component accesses, misalignment
2186 information is irrelevant for them. */
2187 if (STMT_VINFO_STRIDED_P (stmt_info)
2188 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2189 continue;
2191 if (dump_enabled_p ())
2192 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2193 "not vectorized: can't calculate alignment "
2194 "for data ref.\n");
2196 return false;
2200 return true;
2204 /* Analyze alignment of DRs of stmts in NODE. */
2206 static bool
2207 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2209 /* We vectorize from the first scalar stmt in the node unless
2210 the node is permuted, in which case we start from the first
2211 element in the group. */
2212 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2213 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2214 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2215 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2217 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2218 if (! vect_compute_data_ref_alignment (dr)
2219 /* For creating the data-ref pointer we need alignment of the
2220 first element anyway. */
2221 || (dr != first_dr
2222 && ! vect_compute_data_ref_alignment (first_dr))
2223 || ! verify_data_ref_alignment (dr))
2225 if (dump_enabled_p ())
2226 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2227 "not vectorized: bad data alignment in basic "
2228 "block.\n");
2229 return false;
2232 return true;
2235 /* Function vect_slp_analyze_instance_alignment
2237 Analyze the alignment of the data-references in the SLP instance.
2238 Return FALSE if a data reference is found that cannot be vectorized. */
2240 bool
2241 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2243 if (dump_enabled_p ())
2244 dump_printf_loc (MSG_NOTE, vect_location,
2245 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2247 slp_tree node;
2248 unsigned i;
2249 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2250 if (! vect_slp_analyze_and_verify_node_alignment (node))
2251 return false;
2253 node = SLP_INSTANCE_TREE (instance);
2254 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2255 && ! vect_slp_analyze_and_verify_node_alignment
2256 (SLP_INSTANCE_TREE (instance)))
2257 return false;
2259 return true;
2263 /* Analyze groups of accesses: check that DR belongs to a group of
2264 accesses of legal size, step, etc. Detect gaps, single element
2265 interleaving, and other special cases. Set grouped access info.
2266 Collect groups of strided stores for further use in SLP analysis.
2267 Worker for vect_analyze_group_access. */
2269 static bool
2270 vect_analyze_group_access_1 (struct data_reference *dr)
2272 tree step = DR_STEP (dr);
2273 tree scalar_type = TREE_TYPE (DR_REF (dr));
2274 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2275 gimple *stmt = DR_STMT (dr);
2276 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2277 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2278 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2279 HOST_WIDE_INT dr_step = -1;
2280 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2281 bool slp_impossible = false;
2283 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2284 size of the interleaving group (including gaps). */
2285 if (tree_fits_shwi_p (step))
2287 dr_step = tree_to_shwi (step);
2288 /* Check that STEP is a multiple of type size. Otherwise there is
2289 a non-element-sized gap at the end of the group which we
2290 cannot represent in GROUP_GAP or GROUP_SIZE.
2291 ??? As we can handle non-constant step fine here we should
2292 simply remove uses of GROUP_GAP between the last and first
2293 element and instead rely on DR_STEP. GROUP_SIZE then would
2294 simply not include that gap. */
2295 if ((dr_step % type_size) != 0)
2297 if (dump_enabled_p ())
2299 dump_printf_loc (MSG_NOTE, vect_location,
2300 "Step ");
2301 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2302 dump_printf (MSG_NOTE,
2303 " is not a multiple of the element size for ");
2304 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2305 dump_printf (MSG_NOTE, "\n");
2307 return false;
2309 groupsize = absu_hwi (dr_step) / type_size;
2311 else
2312 groupsize = 0;
2314 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2315 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2317 /* Check whether this DR is part of an interleaving group of which it is
2318 the single element accessed in the loop. */
2320 /* Gaps are supported only for loads. STEP must be a multiple of the type
2321 size. The size of the group must be a power of 2. */
2322 if (DR_IS_READ (dr)
2323 && (dr_step % type_size) == 0
2324 && groupsize > 0
2325 && pow2p_hwi (groupsize))
2327 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2328 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2329 GROUP_GAP (stmt_info) = groupsize - 1;
2330 if (dump_enabled_p ())
2332 dump_printf_loc (MSG_NOTE, vect_location,
2333 "Detected single element interleaving ");
2334 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2335 dump_printf (MSG_NOTE, " step ");
2336 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2337 dump_printf (MSG_NOTE, "\n");
2340 return true;
2343 if (dump_enabled_p ())
2345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2346 "not consecutive access ");
2347 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2350 if (bb_vinfo)
2352 /* Mark the statement as unvectorizable. */
2353 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2354 return true;
2357 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2358 STMT_VINFO_STRIDED_P (stmt_info) = true;
2359 return true;
2362 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2364 /* First stmt in the interleaving chain. Check the chain. */
2365 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2366 struct data_reference *data_ref = dr;
2367 unsigned int count = 1;
2368 tree prev_init = DR_INIT (data_ref);
2369 gimple *prev = stmt;
2370 HOST_WIDE_INT diff, gaps = 0;
2372 while (next)
2374 /* Skip identical data-refs. If two or more stmts share a
2375 data-ref (supported only for loads), we vectorize only the first
2376 stmt, and the rest get their vectorized loads from the first
2377 one. */
2378 if (!tree_int_cst_compare (DR_INIT (data_ref),
2379 DR_INIT (STMT_VINFO_DATA_REF (
2380 vinfo_for_stmt (next)))))
2382 if (DR_IS_WRITE (data_ref))
2384 if (dump_enabled_p ())
2385 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2386 "Two store stmts share the same dr.\n");
2387 return false;
2390 if (dump_enabled_p ())
2391 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2392 "Two or more load stmts share the same dr.\n");
2394 /* For loads use the same data-ref load. */
2395 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2397 prev = next;
2398 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2399 continue;
2402 prev = next;
2403 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2405 /* All group members have the same STEP by construction. */
2406 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2408 /* Check that the distance between two accesses is equal to the type
2409 size. Otherwise, we have gaps. */
2410 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2411 - TREE_INT_CST_LOW (prev_init)) / type_size;
2412 if (diff != 1)
2414 /* FORNOW: SLP of accesses with gaps is not supported. */
2415 slp_impossible = true;
2416 if (DR_IS_WRITE (data_ref))
2418 if (dump_enabled_p ())
2419 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2420 "interleaved store with gaps\n");
2421 return false;
2424 gaps += diff - 1;
2427 last_accessed_element += diff;
2429 /* Store the gap from the previous member of the group. If there is no
2430 gap in the access, GROUP_GAP is always 1. */
2431 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2433 prev_init = DR_INIT (data_ref);
2434 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2435 /* Count the number of data-refs in the chain. */
2436 count++;
2439 if (groupsize == 0)
2440 groupsize = count + gaps;
2442 /* This could be UINT_MAX but as we are generating code in a very
2443 inefficient way we have to cap earlier. See PR78699 for example. */
2444 if (groupsize > 4096)
2446 if (dump_enabled_p ())
2447 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2448 "group is too large\n");
2449 return false;
2452 /* Check that the size of the interleaving is equal to count for stores,
2453 i.e., that there are no gaps. */
2454 if (groupsize != count
2455 && !DR_IS_READ (dr))
2457 if (dump_enabled_p ())
2458 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2459 "interleaved store with gaps\n");
2460 return false;
2463 /* If there is a gap after the last load in the group it is the
2464 difference between the groupsize and the last accessed
2465 element.
2466 When there is no gap, this difference should be 0. */
2467 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2469 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2470 if (dump_enabled_p ())
2472 dump_printf_loc (MSG_NOTE, vect_location,
2473 "Detected interleaving ");
2474 if (DR_IS_READ (dr))
2475 dump_printf (MSG_NOTE, "load ");
2476 else
2477 dump_printf (MSG_NOTE, "store ");
2478 dump_printf (MSG_NOTE, "of size %u starting with ",
2479 (unsigned)groupsize);
2480 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2481 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2482 dump_printf_loc (MSG_NOTE, vect_location,
2483 "There is a gap of %u elements after the group\n",
2484 GROUP_GAP (vinfo_for_stmt (stmt)));
2487 /* SLP: create an SLP data structure for every interleaving group of
2488 stores for further analysis in vect_analyze_slp. */
2489 if (DR_IS_WRITE (dr) && !slp_impossible)
2491 if (loop_vinfo)
2492 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2493 if (bb_vinfo)
2494 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2498 return true;
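/* A worked example, assuming 4-byte ints: for the loads a[4*i] and
   a[4*i + 2] the step is 16 so groupsize == 4, the init difference gives
   diff == 2, hence GROUP_GAP of the second member is 2,
   last_accessed_element ends up as 3 and GROUP_GAP of the first member
   is 4 - 3 == 1, i.e. a one-element gap remains after the group.  */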
2501 /* Analyze groups of accesses: check that DR belongs to a group of
2502 accesses of legal size, step, etc. Detect gaps, single element
2503 interleaving, and other special cases. Set grouped access info.
2504 Collect groups of strided stores for further use in SLP analysis. */
2506 static bool
2507 vect_analyze_group_access (struct data_reference *dr)
2509 if (!vect_analyze_group_access_1 (dr))
2511 /* Dissolve the group if present. */
2512 gimple *next;
2513 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2514 while (stmt)
2516 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2517 next = GROUP_NEXT_ELEMENT (vinfo);
2518 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2519 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2520 stmt = next;
2522 return false;
2524 return true;
2527 /* Analyze the access pattern of the data-reference DR.
2528 In case of non-consecutive accesses call vect_analyze_group_access() to
2529 analyze groups of accesses. */
2531 static bool
2532 vect_analyze_data_ref_access (struct data_reference *dr)
2534 tree step = DR_STEP (dr);
2535 tree scalar_type = TREE_TYPE (DR_REF (dr));
2536 gimple *stmt = DR_STMT (dr);
2537 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2538 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2539 struct loop *loop = NULL;
2541 if (loop_vinfo)
2542 loop = LOOP_VINFO_LOOP (loop_vinfo);
2544 if (loop_vinfo && !step)
2546 if (dump_enabled_p ())
2547 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2548 "bad data-ref access in loop\n");
2549 return false;
2552 /* Allow loads with zero step in inner-loop vectorization. */
2553 if (loop_vinfo && integer_zerop (step))
2555 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2556 if (!nested_in_vect_loop_p (loop, stmt))
2557 return DR_IS_READ (dr);
2558 /* Allow references with zero step only for outer loops marked
2559 with pragma omp simd - it guarantees absence of
2560 loop-carried dependencies between inner loop iterations. */
2561 if (!loop->force_vectorize)
2563 if (dump_enabled_p ())
2564 dump_printf_loc (MSG_NOTE, vect_location,
2565 "zero step in inner loop of nest\n");
2566 return false;
2570 if (loop && nested_in_vect_loop_p (loop, stmt))
2572 /* Interleaved accesses are not yet supported within outer-loop
2573 vectorization for references in the inner-loop. */
2574 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2576 /* For the rest of the analysis we use the outer-loop step. */
2577 step = STMT_VINFO_DR_STEP (stmt_info);
2578 if (integer_zerop (step))
2580 if (dump_enabled_p ())
2581 dump_printf_loc (MSG_NOTE, vect_location,
2582 "zero step in outer loop.\n");
2583 return DR_IS_READ (dr);
2587 /* Consecutive? */
2588 if (TREE_CODE (step) == INTEGER_CST)
2590 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2591 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2592 || (dr_step < 0
2593 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2595 /* Mark that it is not interleaving. */
2596 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2597 return true;
2601 if (loop && nested_in_vect_loop_p (loop, stmt))
2603 if (dump_enabled_p ())
2604 dump_printf_loc (MSG_NOTE, vect_location,
2605 "grouped access in outer loop.\n");
2606 return false;
2610 /* Assume this is a DR handled by the non-constant strided load case. */
2611 if (TREE_CODE (step) != INTEGER_CST)
2612 return (STMT_VINFO_STRIDED_P (stmt_info)
2613 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2614 || vect_analyze_group_access (dr)));
2616 /* Non-consecutive access - check whether it is part of an interleaving group. */
2617 return vect_analyze_group_access (dr);
2620 /* Compare two data-references DRA and DRB, ordering them into chunks
2621 suitable for grouping. */
2623 static int
2624 dr_group_sort_cmp (const void *dra_, const void *drb_)
2626 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2627 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2628 int cmp;
2630 /* Stabilize sort. */
2631 if (dra == drb)
2632 return 0;
2634 /* DRs in different loops never belong to the same group. */
2635 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2636 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2637 if (loopa != loopb)
2638 return loopa->num < loopb->num ? -1 : 1;
2640 /* Ordering of DRs according to base. */
2641 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2643 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2644 DR_BASE_ADDRESS (drb));
2645 if (cmp != 0)
2646 return cmp;
2649 /* And according to DR_OFFSET. */
2650 if (!dr_equal_offsets_p (dra, drb))
2652 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2653 if (cmp != 0)
2654 return cmp;
2657 /* Put reads before writes. */
2658 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2659 return DR_IS_READ (dra) ? -1 : 1;
2661 /* Then sort after access size. */
2662 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2663 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2665 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2666 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2667 if (cmp != 0)
2668 return cmp;
2671 /* And after step. */
2672 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2674 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2675 if (cmp != 0)
2676 return cmp;
2679 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2680 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2681 if (cmp == 0)
2682 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2683 return cmp;
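/* The effect of this ordering, for example on the data-refs of a[i],
   a[i + 1] and b[i] with distinct bases: the two accesses to a sort next
   to each other (same base, ordered by DR_INIT) and apart from the
   access to b, so references that can form an interleaving chain become
   neighbors in the sorted vector.  */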
2686 /* Function vect_analyze_data_ref_accesses.
2688 Analyze the access pattern of all the data references in the loop.
2690 FORNOW: the only access pattern that is considered vectorizable is a
2691 simple step 1 (consecutive) access.
2693 FORNOW: handle only arrays and pointer accesses. */
2695 bool
2696 vect_analyze_data_ref_accesses (vec_info *vinfo)
2698 unsigned int i;
2699 vec<data_reference_p> datarefs = vinfo->datarefs;
2700 struct data_reference *dr;
2702 if (dump_enabled_p ())
2703 dump_printf_loc (MSG_NOTE, vect_location,
2704 "=== vect_analyze_data_ref_accesses ===\n");
2706 if (datarefs.is_empty ())
2707 return true;
2709 /* Sort the array of datarefs to make building the interleaving chains
2710 linear. Don't modify the original vector's order, it is needed for
2711 determining what dependencies are reversed. */
2712 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2713 datarefs_copy.qsort (dr_group_sort_cmp);
2715 /* Build the interleaving chains. */
2716 for (i = 0; i < datarefs_copy.length () - 1;)
2718 data_reference_p dra = datarefs_copy[i];
2719 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2720 stmt_vec_info lastinfo = NULL;
2721 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2723 ++i;
2724 continue;
2726 for (i = i + 1; i < datarefs_copy.length (); ++i)
2728 data_reference_p drb = datarefs_copy[i];
2729 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2730 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2731 break;
2733 /* ??? Imperfect sorting (non-compatible types, non-modulo
2734 accesses, same accesses) can lead to a group being artificially
2735 split here as we don't just skip over those. If it really
2736 matters we can push those to a worklist and re-iterate
2737 over them. Then we can just skip ahead to the next DR here. */
2739 /* DRs in a different loop should not be put into the same
2740 interleaving group. */
2741 if (gimple_bb (DR_STMT (dra))->loop_father
2742 != gimple_bb (DR_STMT (drb))->loop_father)
2743 break;
2745 /* Check that the data-refs have the same first location (except init)
2746 and that they are both either stores or loads (not a load and a store,
2747 and not masked loads or stores). */
2748 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2749 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2750 DR_BASE_ADDRESS (drb), 0)
2751 || !dr_equal_offsets_p (dra, drb)
2752 || !gimple_assign_single_p (DR_STMT (dra))
2753 || !gimple_assign_single_p (DR_STMT (drb)))
2754 break;
2756 /* Check that the data-refs have the same constant size. */
2757 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2758 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2759 if (!tree_fits_uhwi_p (sza)
2760 || !tree_fits_uhwi_p (szb)
2761 || !tree_int_cst_equal (sza, szb))
2762 break;
2764 /* Check that the data-refs have the same step. */
2765 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2766 break;
2768 /* Do not place the same access in the interleaving chain twice. */
2769 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2770 break;
2772 /* Check the types are compatible.
2773 ??? We don't distinguish this during sorting. */
2774 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2775 TREE_TYPE (DR_REF (drb))))
2776 break;
2778 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2779 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2780 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2781 gcc_assert (init_a <= init_b);
2783 /* If init_b == init_a + the size of the type * k, we have an
2784 interleaving, and DRA is accessed before DRB. */
2785 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2786 if (type_size_a == 0
2787 || (init_b - init_a) % type_size_a != 0)
2788 break;
2790 /* If we have a store, require the accesses to be adjacent. This splits
2791 groups into chunks we support (we don't support vectorization
2792 of stores with gaps). */
2793 if (!DR_IS_READ (dra)
2794 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2795 (DR_INIT (datarefs_copy[i-1]))
2796 != type_size_a))
2797 break;
2799 /* If the step (when constant and nonzero) is not greater than the
2800 difference between the data-refs' inits, this splits groups into
2801 suitable sizes. */
2802 if (tree_fits_shwi_p (DR_STEP (dra)))
2804 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2805 if (step != 0 && step <= (init_b - init_a))
2806 break;
2809 if (dump_enabled_p ())
2811 dump_printf_loc (MSG_NOTE, vect_location,
2812 "Detected interleaving ");
2813 if (DR_IS_READ (dra))
2814 dump_printf (MSG_NOTE, "load ");
2815 else
2816 dump_printf (MSG_NOTE, "store ");
2817 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2818 dump_printf (MSG_NOTE, " and ");
2819 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2820 dump_printf (MSG_NOTE, "\n");
2823 /* Link the found element into the group list. */
2824 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2826 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2827 lastinfo = stmtinfo_a;
2829 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2830 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2831 lastinfo = stmtinfo_b;
2835 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2836 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2837 && !vect_analyze_data_ref_access (dr))
2839 if (dump_enabled_p ())
2840 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2841 "not vectorized: complicated access pattern.\n");
2843 if (is_a <bb_vec_info> (vinfo))
2845 /* Mark the statement as not vectorizable. */
2846 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2847 continue;
2849 else
2851 datarefs_copy.release ();
2852 return false;
2856 datarefs_copy.release ();
2857 return true;
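/* A worked example, assuming 4-byte ints: for the loop body
   p[2*i] = x; p[2*i + 1] = y; the two stores share base, offset, size
   and step (8), their inits differ by exactly the element size, and so
   the loop above links them into a single interleaving chain with
   p[2*i] as GROUP_FIRST_ELEMENT.  */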
2860 /* Function vect_vfa_segment_size.
2862 Create an expression that computes the size of the segment
2863 that will be accessed for a data reference. The function takes into
2864 account that realignment loads may access one more vector.
2866 Input:
2867 DR: The data reference.
2868 LENGTH_FACTOR: segment length to consider.
2870 Return an expression whose value is the size of segment which will be
2871 accessed by DR. */
2873 static tree
2874 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2876 tree segment_length;
2878 if (integer_zerop (DR_STEP (dr)))
2879 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2880 else
2881 segment_length = size_binop (MULT_EXPR,
2882 fold_convert (sizetype, DR_STEP (dr)),
2883 fold_convert (sizetype, length_factor));
2885 if (vect_supportable_dr_alignment (dr, false)
2886 == dr_explicit_realign_optimized)
2888 tree vector_size = TYPE_SIZE_UNIT
2889 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2891 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2893 return segment_length;
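/* A worked example: for a data-ref with a 4-byte DR_STEP and a
   LENGTH_FACTOR equal to a vectorization factor of 4, the segment
   length is 16 bytes; with dr_explicit_realign_optimized one extra
   vector size is added on top of that.  */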
2896 /* Function vect_no_alias_p.
2898 Given data references A and B with equal base and offset, for which the
2899 alias relation can be decided at compilation time, return TRUE if they do
2900 not alias each other; return FALSE otherwise. SEGMENT_LENGTH_A
2901 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
2902 respectively.
2904 static bool
2905 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
2906 tree segment_length_a, tree segment_length_b)
2908 gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
2909 && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
2910 if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
2911 return false;
2913 tree seg_a_min = DR_INIT (a);
2914 tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
2915 seg_a_min, segment_length_a);
2916 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
2917 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
2918 [a, a+12) */
2919 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
2921 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
2922 seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
2923 seg_a_max, unit_size);
2924 seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
2925 DR_INIT (a), unit_size);
2927 tree seg_b_min = DR_INIT (b);
2928 tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
2929 seg_b_min, segment_length_b);
2930 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
2932 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
2933 seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
2934 seg_b_max, unit_size);
2935 seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
2936 DR_INIT (b), unit_size);
2939 if (tree_int_cst_le (seg_a_max, seg_b_min)
2940 || tree_int_cst_le (seg_b_max, seg_a_min))
2941 return true;
2943 return false;
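/* A worked example, assuming positive steps: with DR_INIT values 0 and
   32 and segment lengths of 16 bytes, segment A is [0, 16) and segment
   B is [32, 48), so seg_a_max <= seg_b_min holds and the pair is proven
   not to alias at compile time.  */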
2946 /* Function vect_prune_runtime_alias_test_list.
2948 Prune a list of ddrs to be tested at run-time by versioning for alias.
2949 Merge several alias checks into one if possible.
2950 Return FALSE if the resulting list of ddrs is longer than allowed by
2951 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2953 bool
2954 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2956 vec<ddr_p> may_alias_ddrs =
2957 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2958 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2959 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2960 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2961 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2963 ddr_p ddr;
2964 unsigned int i;
2965 tree length_factor;
2967 if (dump_enabled_p ())
2968 dump_printf_loc (MSG_NOTE, vect_location,
2969 "=== vect_prune_runtime_alias_test_list ===\n");
2971 if (may_alias_ddrs.is_empty ())
2972 return true;
2974 comp_alias_ddrs.create (may_alias_ddrs.length ());
2976 /* First, we collect all data ref pairs for aliasing checks. */
2977 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2979 int comp_res;
2980 struct data_reference *dr_a, *dr_b;
2981 gimple *dr_group_first_a, *dr_group_first_b;
2982 tree segment_length_a, segment_length_b;
2983 gimple *stmt_a, *stmt_b;
2985 dr_a = DDR_A (ddr);
2986 stmt_a = DR_STMT (DDR_A (ddr));
2987 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2988 if (dr_group_first_a)
2990 stmt_a = dr_group_first_a;
2991 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2994 dr_b = DDR_B (ddr);
2995 stmt_b = DR_STMT (DDR_B (ddr));
2996 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2997 if (dr_group_first_b)
2999 stmt_b = dr_group_first_b;
3000 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3003 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3004 length_factor = scalar_loop_iters;
3005 else
3006 length_factor = size_int (vect_factor);
3007 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3008 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3010 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3011 DR_BASE_ADDRESS (dr_b));
3012 if (comp_res == 0)
3013 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3014 DR_OFFSET (dr_b));
3016 /* Alias is known at compilation time. */
3017 if (comp_res == 0
3018 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3019 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3020 && TREE_CODE (segment_length_a) == INTEGER_CST
3021 && TREE_CODE (segment_length_b) == INTEGER_CST)
3023 if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b))
3024 continue;
3026 if (dump_enabled_p ())
3027 dump_printf_loc (MSG_NOTE, vect_location,
3028 "not vectorized: compilation time alias.\n");
3030 return false;
3033 dr_with_seg_len_pair_t dr_with_seg_len_pair
3034 (dr_with_seg_len (dr_a, segment_length_a),
3035 dr_with_seg_len (dr_b, segment_length_b));
3037 /* Canonicalize pairs by sorting the two DR members. */
3038 if (comp_res > 0)
3039 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3041 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3044 prune_runtime_alias_test_list (&comp_alias_ddrs,
3045 (unsigned HOST_WIDE_INT) vect_factor);
3046 dump_printf_loc (MSG_NOTE, vect_location,
3047 "improved number of alias checks from %d to %d\n",
3048 may_alias_ddrs.length (), comp_alias_ddrs.length ());
3049 if ((int) comp_alias_ddrs.length () >
3050 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3052 if (dump_enabled_p ())
3053 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3054 "number of versioning for alias "
3055 "run-time tests exceeds %d "
3056 "(--param vect-max-version-for-alias-checks)\n",
3057 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3058 return false;
3061 /* All alias checks have been resolved at compilation time. */
3062 if (!comp_alias_ddrs.length ())
3063 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0);
3065 return true;
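/* For example, two pairs such as (a[i], b[i]) and (a[i + 1], b[i])
   typically collapse into a single runtime check of one address range
   against another once prune_runtime_alias_test_list has merged their
   segments.  */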
3068 /* Return true if a non-affine read or write in STMT is suitable for a
3069 gather load or scatter store. Describe the operation in *INFO if so. */
3071 bool
3072 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3073 gather_scatter_info *info)
3075 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3076 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3077 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3078 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3079 tree offtype = NULL_TREE;
3080 tree decl, base, off;
3081 machine_mode pmode;
3082 int punsignedp, reversep, pvolatilep = 0;
3084 base = DR_REF (dr);
3085 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3086 see if we can use the def stmt of the address. */
3087 if (is_gimple_call (stmt)
3088 && gimple_call_internal_p (stmt)
3089 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3090 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3091 && TREE_CODE (base) == MEM_REF
3092 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3093 && integer_zerop (TREE_OPERAND (base, 1))
3094 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3096 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3097 if (is_gimple_assign (def_stmt)
3098 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3099 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3102 /* The gather and scatter builtins need address of the form
3103 loop_invariant + vector * {1, 2, 4, 8}
3105 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3106 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3107 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3108 multiplications and additions in it. To get a vector, we need
3109 a single SSA_NAME that will be defined in the loop and will
3110 contain everything that is not loop invariant and that can be
3111 vectorized. The following code attempts to find such a preexisting
3112 SSA_NAME OFF and put the loop invariants into a tree BASE
3113 that can be gimplified before the loop. */
3114 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3115 &punsignedp, &reversep, &pvolatilep);
3116 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3118 if (TREE_CODE (base) == MEM_REF)
3120 if (!integer_zerop (TREE_OPERAND (base, 1)))
3122 if (off == NULL_TREE)
3124 offset_int moff = mem_ref_offset (base);
3125 off = wide_int_to_tree (sizetype, moff);
3127 else
3128 off = size_binop (PLUS_EXPR, off,
3129 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3131 base = TREE_OPERAND (base, 0);
3133 else
3134 base = build_fold_addr_expr (base);
3136 if (off == NULL_TREE)
3137 off = size_zero_node;
3139 /* If base is not loop invariant, then either OFF is 0, in which case we
3140 start with just the constant offset in the loop-invariant BASE and
3141 continue with base as OFF, or we give up.
3142 We could handle that case by gimplifying the addition of base + off
3143 into some SSA_NAME and using that as off, but for now punt. */
3144 if (!expr_invariant_in_loop_p (loop, base))
3146 if (!integer_zerop (off))
3147 return false;
3148 off = base;
3149 base = size_int (pbitpos / BITS_PER_UNIT);
3151 /* Otherwise put base + constant offset into the loop invariant BASE
3152 and continue with OFF. */
3153 else
3155 base = fold_convert (sizetype, base);
3156 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3159 /* OFF at this point may be either an SSA_NAME or some tree expression
3160 from get_inner_reference. Try to peel off loop invariants from it
3161 into BASE as long as possible. */
3162 STRIP_NOPS (off);
3163 while (offtype == NULL_TREE)
3165 enum tree_code code;
3166 tree op0, op1, add = NULL_TREE;
3168 if (TREE_CODE (off) == SSA_NAME)
3170 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3172 if (expr_invariant_in_loop_p (loop, off))
3173 return false;
3175 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3176 break;
3178 op0 = gimple_assign_rhs1 (def_stmt);
3179 code = gimple_assign_rhs_code (def_stmt);
3180 op1 = gimple_assign_rhs2 (def_stmt);
3182 else
3184 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3185 return false;
3186 code = TREE_CODE (off);
3187 extract_ops_from_tree (off, &code, &op0, &op1);
3189 switch (code)
3191 case POINTER_PLUS_EXPR:
3192 case PLUS_EXPR:
3193 if (expr_invariant_in_loop_p (loop, op0))
3195 add = op0;
3196 off = op1;
3197 do_add:
3198 add = fold_convert (sizetype, add);
3199 if (scale != 1)
3200 add = size_binop (MULT_EXPR, add, size_int (scale));
3201 base = size_binop (PLUS_EXPR, base, add);
3202 continue;
3204 if (expr_invariant_in_loop_p (loop, op1))
3206 add = op1;
3207 off = op0;
3208 goto do_add;
3210 break;
3211 case MINUS_EXPR:
3212 if (expr_invariant_in_loop_p (loop, op1))
3214 add = fold_convert (sizetype, op1);
3215 add = size_binop (MINUS_EXPR, size_zero_node, add);
3216 off = op0;
3217 goto do_add;
3219 break;
3220 case MULT_EXPR:
3221 if (scale == 1 && tree_fits_shwi_p (op1))
3223 scale = tree_to_shwi (op1);
3224 off = op0;
3225 continue;
3227 break;
3228 case SSA_NAME:
3229 off = op0;
3230 continue;
3231 CASE_CONVERT:
3232 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3233 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3234 break;
3235 if (TYPE_PRECISION (TREE_TYPE (op0))
3236 == TYPE_PRECISION (TREE_TYPE (off)))
3238 off = op0;
3239 continue;
3241 if (TYPE_PRECISION (TREE_TYPE (op0))
3242 < TYPE_PRECISION (TREE_TYPE (off)))
3244 off = op0;
3245 offtype = TREE_TYPE (off);
3246 STRIP_NOPS (off);
3247 continue;
3249 break;
3250 default:
3251 break;
3253 break;
3256 /* If at the end OFF still isn't an SSA_NAME or isn't
3257 defined in the loop, punt. */
3258 if (TREE_CODE (off) != SSA_NAME
3259 || expr_invariant_in_loop_p (loop, off))
3260 return false;
3262 if (offtype == NULL_TREE)
3263 offtype = TREE_TYPE (off);
3265 if (DR_IS_READ (dr))
3266 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3267 offtype, scale);
3268 else
3269 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3270 offtype, scale);
3272 if (decl == NULL_TREE)
3273 return false;
3275 info->decl = decl;
3276 info->base = base;
3277 info->offset = off;
3278 info->offset_dt = vect_unknown_def_type;
3279 info->offset_vectype = NULL_TREE;
3280 info->scale = scale;
3281 return true;
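/* A worked example, assuming 4-byte elements: for a read such as
   x = a[b[i]] the walk above typically ends with BASE == &a, OFF being
   the SSA name holding the (possibly sign-extended) value of b[i] and
   SCALE == 4, which matches the loop_invariant + vector * {1,2,4,8}
   form expected by the gather builtins.  */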
3284 /* Function vect_analyze_data_refs.
3286 Find all the data references in the loop or basic block.
3288 The general structure of the analysis of data refs in the vectorizer is as
3289 follows:
3290 1- vect_analyze_data_refs(loop/bb): call
3291 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3292 in the loop/bb and their dependences.
3293 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3294 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3295 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3299 bool
3300 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3302 struct loop *loop = NULL;
3303 unsigned int i;
3304 struct data_reference *dr;
3305 tree scalar_type;
3307 if (dump_enabled_p ())
3308 dump_printf_loc (MSG_NOTE, vect_location,
3309 "=== vect_analyze_data_refs ===\n");
3311 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3312 loop = LOOP_VINFO_LOOP (loop_vinfo);
3314 /* Go through the data-refs, check that the analysis succeeded. Update
3315 pointer from stmt_vec_info struct to DR and vectype. */
3317 vec<data_reference_p> datarefs = vinfo->datarefs;
3318 FOR_EACH_VEC_ELT (datarefs, i, dr)
3320 gimple *stmt;
3321 stmt_vec_info stmt_info;
3322 tree base, offset, init;
3323 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3324 bool simd_lane_access = false;
3325 int vf;
3327 again:
3328 if (!dr || !DR_REF (dr))
3330 if (dump_enabled_p ())
3331 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3332 "not vectorized: unhandled data-ref\n");
3333 return false;
3336 stmt = DR_STMT (dr);
3337 stmt_info = vinfo_for_stmt (stmt);
3339 /* Discard clobbers from the dataref vector. We will remove
3340 clobber stmts during vectorization. */
3341 if (gimple_clobber_p (stmt))
3343 free_data_ref (dr);
3344 if (i == datarefs.length () - 1)
3346 datarefs.pop ();
3347 break;
3349 datarefs.ordered_remove (i);
3350 dr = datarefs[i];
3351 goto again;
3354 /* Check that analysis of the data-ref succeeded. */
3355 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3356 || !DR_STEP (dr))
3358 bool maybe_gather
3359 = DR_IS_READ (dr)
3360 && !TREE_THIS_VOLATILE (DR_REF (dr))
3361 && targetm.vectorize.builtin_gather != NULL;
3362 bool maybe_scatter
3363 = DR_IS_WRITE (dr)
3364 && !TREE_THIS_VOLATILE (DR_REF (dr))
3365 && targetm.vectorize.builtin_scatter != NULL;
3366 bool maybe_simd_lane_access
3367 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3369 /* If the target supports vector gather loads or scatter stores, or if
3370 this might be a SIMD lane access, see whether they can be used. */
3371 if (is_a <loop_vec_info> (vinfo)
3372 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3373 && !nested_in_vect_loop_p (loop, stmt))
3375 struct data_reference *newdr
3376 = create_data_ref (NULL, loop_containing_stmt (stmt),
3377 DR_REF (dr), stmt, maybe_scatter ? false : true);
3378 gcc_assert (newdr != NULL && DR_REF (newdr));
3379 if (DR_BASE_ADDRESS (newdr)
3380 && DR_OFFSET (newdr)
3381 && DR_INIT (newdr)
3382 && DR_STEP (newdr)
3383 && integer_zerop (DR_STEP (newdr)))
3385 if (maybe_simd_lane_access)
3387 tree off = DR_OFFSET (newdr);
3388 STRIP_NOPS (off);
3389 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3390 && TREE_CODE (off) == MULT_EXPR
3391 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3393 tree step = TREE_OPERAND (off, 1);
3394 off = TREE_OPERAND (off, 0);
3395 STRIP_NOPS (off);
3396 if (CONVERT_EXPR_P (off)
3397 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3398 0)))
3399 < TYPE_PRECISION (TREE_TYPE (off)))
3400 off = TREE_OPERAND (off, 0);
3401 if (TREE_CODE (off) == SSA_NAME)
3403 gimple *def = SSA_NAME_DEF_STMT (off);
3404 tree reft = TREE_TYPE (DR_REF (newdr));
3405 if (is_gimple_call (def)
3406 && gimple_call_internal_p (def)
3407 && (gimple_call_internal_fn (def)
3408 == IFN_GOMP_SIMD_LANE))
3410 tree arg = gimple_call_arg (def, 0);
3411 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3412 arg = SSA_NAME_VAR (arg);
3413 if (arg == loop->simduid
3414 /* For now. */
3415 && tree_int_cst_equal
3416 (TYPE_SIZE_UNIT (reft),
3417 step))
3419 DR_OFFSET (newdr) = ssize_int (0);
3420 DR_STEP (newdr) = step;
3421 DR_ALIGNED_TO (newdr)
3422 = size_int (BIGGEST_ALIGNMENT);
3423 dr = newdr;
3424 simd_lane_access = true;
3430 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3432 dr = newdr;
3433 if (maybe_gather)
3434 gatherscatter = GATHER;
3435 else
3436 gatherscatter = SCATTER;
3439 if (gatherscatter == SG_NONE && !simd_lane_access)
3440 free_data_ref (newdr);
3443 if (gatherscatter == SG_NONE && !simd_lane_access)
3445 if (dump_enabled_p ())
3447 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3448 "not vectorized: data ref analysis "
3449 "failed ");
3450 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3453 if (is_a <bb_vec_info> (vinfo))
3454 break;
3456 return false;
3460 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3462 if (dump_enabled_p ())
3463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3464 "not vectorized: base addr of dr is a "
3465 "constant\n");
3467 if (is_a <bb_vec_info> (vinfo))
3468 break;
3470 if (gatherscatter != SG_NONE || simd_lane_access)
3471 free_data_ref (dr);
3472 return false;
3475 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3477 if (dump_enabled_p ())
3479 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3480 "not vectorized: volatile type ");
3481 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3484 if (is_a <bb_vec_info> (vinfo))
3485 break;
3487 return false;
3490 if (stmt_can_throw_internal (stmt))
3492 if (dump_enabled_p ())
3494 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3495 "not vectorized: statement can throw an "
3496 "exception ");
3497 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3500 if (is_a <bb_vec_info> (vinfo))
3501 break;
3503 if (gatherscatter != SG_NONE || simd_lane_access)
3504 free_data_ref (dr);
3505 return false;
3508 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3509 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3511 if (dump_enabled_p ())
3513 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3514 "not vectorized: statement is bitfield "
3515 "access ");
3516 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3519 if (is_a <bb_vec_info> (vinfo))
3520 break;
3522 if (gatherscatter != SG_NONE || simd_lane_access)
3523 free_data_ref (dr);
3524 return false;
3527 base = unshare_expr (DR_BASE_ADDRESS (dr));
3528 offset = unshare_expr (DR_OFFSET (dr));
3529 init = unshare_expr (DR_INIT (dr));
3531 if (is_gimple_call (stmt)
3532 && (!gimple_call_internal_p (stmt)
3533 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3534 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3536 if (dump_enabled_p ())
3538 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3539 "not vectorized: dr in a call ");
3540 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3543 if (is_a <bb_vec_info> (vinfo))
3544 break;
3546 if (gatherscatter != SG_NONE || simd_lane_access)
3547 free_data_ref (dr);
3548 return false;
3551 /* Update DR field in stmt_vec_info struct. */
3553 /* If the dataref is in an inner-loop of the loop that is considered for
3554 vectorization, we also want to analyze the access relative to
3555 the outer-loop (DR contains information only relative to the
3556 inner-most enclosing loop). We do that by building a reference to the
3557 first location accessed by the inner-loop, and analyze it relative to
3558 the outer-loop. */
3559 if (loop && nested_in_vect_loop_p (loop, stmt))
3561 tree outer_step, outer_base, outer_init;
3562 HOST_WIDE_INT pbitsize, pbitpos;
3563 tree poffset;
3564 machine_mode pmode;
3565 int punsignedp, preversep, pvolatilep;
3566 affine_iv base_iv, offset_iv;
3567 tree dinit;
3569 /* Build a reference to the first location accessed by the
3570 inner-loop: *(BASE+INIT). (The first location is actually
3571 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3572 tree inner_base = build_fold_indirect_ref
3573 (fold_build_pointer_plus (base, init));
3575 if (dump_enabled_p ())
3577 dump_printf_loc (MSG_NOTE, vect_location,
3578 "analyze in outer-loop: ");
3579 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3580 dump_printf (MSG_NOTE, "\n");
3583 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3584 &poffset, &pmode, &punsignedp,
3585 &preversep, &pvolatilep);
3586 gcc_assert (outer_base != NULL_TREE);
3588 if (pbitpos % BITS_PER_UNIT != 0)
3590 if (dump_enabled_p ())
3591 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3592 "failed: bit offset alignment.\n");
3593 return false;
3596 if (preversep)
3598 if (dump_enabled_p ())
3599 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3600 "failed: reverse storage order.\n");
3601 return false;
3604 outer_base = build_fold_addr_expr (outer_base);
3605 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3606 &base_iv, false))
3608 if (dump_enabled_p ())
3609 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3610 "failed: evolution of base is not affine.\n");
3611 return false;
3614 if (offset)
3616 if (poffset)
3617 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3618 poffset);
3619 else
3620 poffset = offset;
3623 if (!poffset)
3625 offset_iv.base = ssize_int (0);
3626 offset_iv.step = ssize_int (0);
3628 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3629 &offset_iv, false))
3631 if (dump_enabled_p ())
3632 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3633 "evolution of offset is not affine.\n");
3634 return false;
3637 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3638 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3639 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3640 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3641 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3643 outer_step = size_binop (PLUS_EXPR,
3644 fold_convert (ssizetype, base_iv.step),
3645 fold_convert (ssizetype, offset_iv.step));
3647 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3648 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3649 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3650 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3651 STMT_VINFO_DR_OFFSET (stmt_info) =
3652 fold_convert (ssizetype, offset_iv.base);
3653 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3654 size_int (highest_pow2_factor (offset_iv.base));
3656 if (dump_enabled_p ())
3658 dump_printf_loc (MSG_NOTE, vect_location,
3659 "\touter base_address: ");
3660 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3661 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3662 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3663 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3664 STMT_VINFO_DR_OFFSET (stmt_info));
3665 dump_printf (MSG_NOTE,
3666 "\n\touter constant offset from base address: ");
3667 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3668 STMT_VINFO_DR_INIT (stmt_info));
3669 dump_printf (MSG_NOTE, "\n\touter step: ");
3670 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3671 STMT_VINFO_DR_STEP (stmt_info));
3672 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3673 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3674 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3675 dump_printf (MSG_NOTE, "\n");
3679 if (STMT_VINFO_DATA_REF (stmt_info))
3681 if (dump_enabled_p ())
3683 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3684 "not vectorized: more than one data ref "
3685 "in stmt: ");
3686 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3689 if (is_a <bb_vec_info> (vinfo))
3690 break;
3692 if (gatherscatter != SG_NONE || simd_lane_access)
3693 free_data_ref (dr);
3694 return false;
3697 STMT_VINFO_DATA_REF (stmt_info) = dr;
3698 if (simd_lane_access)
3700 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3701 free_data_ref (datarefs[i]);
3702 datarefs[i] = dr;
3705 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
3706 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
3707 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
3709 if (dump_enabled_p ())
3711 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3712 "not vectorized: base object not addressable "
3713 "for stmt: ");
3714 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3716 if (is_a <bb_vec_info> (vinfo))
3718 /* In BB vectorization the ref can still participate
3719 in dependence analysis; we just can't vectorize it. */
3720 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3721 continue;
3723 return false;
3726 /* Set vectype for STMT. */
3727 scalar_type = TREE_TYPE (DR_REF (dr));
3728 STMT_VINFO_VECTYPE (stmt_info)
3729 = get_vectype_for_scalar_type (scalar_type);
3730 if (!STMT_VINFO_VECTYPE (stmt_info))
3732 if (dump_enabled_p ())
3734 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3735 "not vectorized: no vectype for stmt: ");
3736 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3737 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3738 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3739 scalar_type);
3740 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3743 if (is_a <bb_vec_info> (vinfo))
3745 /* No vector type is fine; the ref can still participate
3746 in dependence analysis, we just can't vectorize it. */
3747 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3748 continue;
3751 if (gatherscatter != SG_NONE || simd_lane_access)
3753 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3754 if (gatherscatter != SG_NONE)
3755 free_data_ref (dr);
3757 return false;
3759 else
3761 if (dump_enabled_p ())
3763 dump_printf_loc (MSG_NOTE, vect_location,
3764 "got vectype for stmt: ");
3765 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3766 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3767 STMT_VINFO_VECTYPE (stmt_info));
3768 dump_printf (MSG_NOTE, "\n");
3772 /* Adjust the minimal vectorization factor according to the
3773 vector type. */
3774 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3775 if (vf > *min_vf)
3776 *min_vf = vf;
3778 if (gatherscatter != SG_NONE)
3780 gather_scatter_info gs_info;
3781 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3782 &gs_info)
3783 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3785 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3786 free_data_ref (dr);
3787 if (dump_enabled_p ())
3789 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3790 (gatherscatter == GATHER) ?
3791 "not vectorized: not suitable for gather "
3792 "load " :
3793 "not vectorized: not suitable for scatter "
3794 "store ");
3795 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3797 return false;
3800 free_data_ref (datarefs[i]);
3801 datarefs[i] = dr;
3802 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3805 else if (is_a <loop_vec_info> (vinfo)
3806 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3808 if (nested_in_vect_loop_p (loop, stmt))
3810 if (dump_enabled_p ())
3812 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3813 "not vectorized: not suitable for strided "
3814 "load ");
3815 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3817 return false;
3819 STMT_VINFO_STRIDED_P (stmt_info) = true;
3823 /* If, when trying to vectorize a basic block, we stopped the analysis
3824 at the first dataref we could not analyze, mark the remaining datarefs
3825 as not vectorizable and truncate the vector of datarefs. That avoids
3826 spending useless time analyzing their dependences.
3827 if (i != datarefs.length ())
3829 gcc_assert (is_a <bb_vec_info> (vinfo));
3830 for (unsigned j = i; j < datarefs.length (); ++j)
3832 data_reference_p dr = datarefs[j];
3833 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3834 free_data_ref (dr);
3836 datarefs.truncate (i);
3839 return true;
3843 /* Function vect_get_new_vect_var.
3845 Returns a name for a new variable. The current naming scheme prepends
3846 a prefix determined by VAR_KIND ("vect", "vectp", "stmp" or "mask")
3847 to NAME, if NAME is provided, and otherwise uses the prefix on its
3848 own. */
3850 tree
3851 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3853 const char *prefix;
3854 tree new_vect_var;
3856 switch (var_kind)
3858 case vect_simple_var:
3859 prefix = "vect";
3860 break;
3861 case vect_scalar_var:
3862 prefix = "stmp";
3863 break;
3864 case vect_mask_var:
3865 prefix = "mask";
3866 break;
3867 case vect_pointer_var:
3868 prefix = "vectp";
3869 break;
3870 default:
3871 gcc_unreachable ();
3874 if (name)
3876 char* tmp = concat (prefix, "_", name, NULL);
3877 new_vect_var = create_tmp_reg (type, tmp);
3878 free (tmp);
3880 else
3881 new_vect_var = create_tmp_reg (type, prefix);
3883 return new_vect_var;
3886 /* Like vect_get_new_vect_var but return an SSA name. */
3888 tree
3889 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3891 const char *prefix;
3892 tree new_vect_var;
3894 switch (var_kind)
3896 case vect_simple_var:
3897 prefix = "vect";
3898 break;
3899 case vect_scalar_var:
3900 prefix = "stmp";
3901 break;
3902 case vect_pointer_var:
3903 prefix = "vectp";
3904 break;
3905 default:
3906 gcc_unreachable ();
3909 if (name)
3911 char* tmp = concat (prefix, "_", name, NULL);
3912 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
3913 free (tmp);
3915 else
3916 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
3918 return new_vect_var;
3921 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
3923 static void
3924 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3925 stmt_vec_info stmt_info)
3927 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3928 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3929 int misalign = DR_MISALIGNMENT (dr);
3930 if (misalign == DR_MISALIGNMENT_UNKNOWN)
3931 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3932 else
3933 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
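/* Editorial sketch, not part of the vectorizer: the property that the
   alignment/misalignment pair recorded above encodes.  ADDR stands for
   the pointer value as an integer; the function name and the use of
   unsigned long are illustrative assumptions only.  */

static int
example_misalignment_holds (unsigned long addr, unsigned int align,
			    unsigned int misalign)
{
  /* A known misalignment means the address is MISALIGN bytes past the
     previous ALIGN-byte boundary; DR_MISALIGNMENT_UNKNOWN gives no such
     guarantee, hence mark_ptr_info_alignment_unknown above.  */
  return addr % align == misalign;
}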
3936 /* Function vect_create_addr_base_for_vector_ref.
3938 Create an expression that computes the address of the first memory location
3939 that will be accessed for a data reference.
3941 Input:
3942 STMT: The statement containing the data reference.
3943 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3944 OFFSET: Optional. If supplied, it is added to the initial address.
3945 LOOP: Specify relative to which loop-nest the address should be computed.
3946 For example, when the dataref is in an inner-loop nested in an
3947 outer-loop that is now being vectorized, LOOP can be either the
3948 outer-loop, or the inner-loop. The first memory location accessed
3949 by the following dataref ('in' points to short):
3951 for (i=0; i<N; i++)
3952 for (j=0; j<M; j++)
3953 s += in[i+j]
3955 is as follows:
3956 if LOOP=i_loop: &in (relative to i_loop)
3957 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3958 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
3959 initial address. Unlike OFFSET, which is a number of elements to
3960 be added, BYTE_OFFSET is measured in bytes.
3962 Output:
3963 1. Return an SSA_NAME whose value is the address of the memory location of
3964 the first vector of the data reference.
3965 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3966 these statement(s) which define the returned SSA_NAME.
3968 FORNOW: We are only handling array accesses with step 1. */
3970 tree
3971 vect_create_addr_base_for_vector_ref (gimple *stmt,
3972 gimple_seq *new_stmt_list,
3973 tree offset,
3974 struct loop *loop,
3975 tree byte_offset)
3977 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3978 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3979 tree data_ref_base;
3980 const char *base_name;
3981 tree addr_base;
3982 tree dest;
3983 gimple_seq seq = NULL;
3984 tree base_offset;
3985 tree init;
3986 tree vect_ptr_type;
3987 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3988 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3990 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3992 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3994 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3996 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3997 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3998 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
4000 else
4002 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
4003 base_offset = unshare_expr (DR_OFFSET (dr));
4004 init = unshare_expr (DR_INIT (dr));
4007 if (loop_vinfo)
4008 base_name = get_name (data_ref_base);
4009 else
4011 base_offset = ssize_int (0);
4012 init = ssize_int (0);
4013 base_name = get_name (DR_REF (dr));
4016 /* Create base_offset */
4017 base_offset = size_binop (PLUS_EXPR,
4018 fold_convert (sizetype, base_offset),
4019 fold_convert (sizetype, init));
4021 if (offset)
4023 offset = fold_build2 (MULT_EXPR, sizetype,
4024 fold_convert (sizetype, offset), step);
4025 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4026 base_offset, offset);
4028 if (byte_offset)
4030 byte_offset = fold_convert (sizetype, byte_offset);
4031 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4032 base_offset, byte_offset);
4035 /* base + base_offset */
4036 if (loop_vinfo)
4037 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4038 else
4040 addr_base = build1 (ADDR_EXPR,
4041 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4042 unshare_expr (DR_REF (dr)));
4045 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4046 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4047 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4048 gimple_seq_add_seq (new_stmt_list, seq);
4050 if (DR_PTR_INFO (dr)
4051 && TREE_CODE (addr_base) == SSA_NAME
4052 && !SSA_NAME_PTR_INFO (addr_base))
4054 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4055 if (offset || byte_offset)
4056 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4059 if (dump_enabled_p ())
4061 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4062 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4063 dump_printf (MSG_NOTE, "\n");
4066 return addr_base;
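/* Editorial sketch, not part of the vectorizer: the byte arithmetic that
   the function above emits as gimple, written as plain C.  BASE, INIT,
   OFFSET_ELEMS, ELEM_SIZE and BYTE_OFFSET mirror DR_BASE_ADDRESS, DR_INIT,
   the OFFSET argument (counted in elements), the element size used as
   STEP, and the BYTE_OFFSET argument; all names are illustrative.  */

static char *
example_addr_base (char *base, long init, long offset_elems,
		   long elem_size, long byte_offset)
{
  long base_offset = init;			/* DR_INIT, in bytes.  */
  base_offset += offset_elems * elem_size;	/* OFFSET scaled by STEP.  */
  base_offset += byte_offset;			/* BYTE_OFFSET, already bytes.  */
  return base + base_offset;			/* base + base_offset.  */
}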
4070 /* Function vect_create_data_ref_ptr.
4072 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4073 location accessed in the loop by STMT, along with the def-use update
4074 chain to appropriately advance the pointer through the loop iterations.
4075 Also set aliasing information for the pointer. This pointer is used by
4076 the callers to this function to create a memory reference expression for
4077 vector load/store access.
4079 Input:
4080 1. STMT: a stmt that references memory. Expected to be of the form
4081 GIMPLE_ASSIGN <name, data-ref> or
4082 GIMPLE_ASSIGN <data-ref, name>.
4083 2. AGGR_TYPE: the type of the reference, which should be either a vector
4084 or an array.
4085 3. AT_LOOP: the loop where the vector memref is to be created.
4086 4. OFFSET (optional): an offset to be added to the initial address accessed
4087 by the data-ref in STMT.
4088 5. BSI: location where the new stmts are to be placed if there is no loop
4089 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4090 pointing to the initial address.
4091 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4092 to the initial address accessed by the data-ref in STMT. This is
4093 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4094 in bytes.
4096 Output:
4097 1. Declare a new ptr to vector_type, and have it point to the base of the
4098 data reference (the initial address accessed by the data reference).
4099 For example, for vector of type V8HI, the following code is generated:
4101 v8hi *ap;
4102 ap = (v8hi *)initial_address;
4104 if OFFSET is not supplied:
4105 initial_address = &a[init];
4106 if OFFSET is supplied:
4107 initial_address = &a[init + OFFSET];
4108 if BYTE_OFFSET is supplied:
4109 initial_address = &a[init] + BYTE_OFFSET;
4111 Return the initial_address in INITIAL_ADDRESS.
4113 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4114 update the pointer in each iteration of the loop.
4116 Return the increment stmt that updates the pointer in PTR_INCR.
4118 3. Set INV_P to true if the access pattern of the data reference in the
4119 vectorized loop is invariant. Set it to false otherwise.
4121 4. Return the pointer. */
4123 tree
4124 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4125 tree offset, tree *initial_address,
4126 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4127 bool only_init, bool *inv_p, tree byte_offset)
4129 const char *base_name;
4130 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4131 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4132 struct loop *loop = NULL;
4133 bool nested_in_vect_loop = false;
4134 struct loop *containing_loop = NULL;
4135 tree aggr_ptr_type;
4136 tree aggr_ptr;
4137 tree new_temp;
4138 gimple_seq new_stmt_list = NULL;
4139 edge pe = NULL;
4140 basic_block new_bb;
4141 tree aggr_ptr_init;
4142 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4143 tree aptr;
4144 gimple_stmt_iterator incr_gsi;
4145 bool insert_after;
4146 tree indx_before_incr, indx_after_incr;
4147 gimple *incr;
4148 tree step;
4149 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4151 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4152 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4154 if (loop_vinfo)
4156 loop = LOOP_VINFO_LOOP (loop_vinfo);
4157 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4158 containing_loop = (gimple_bb (stmt))->loop_father;
4159 pe = loop_preheader_edge (loop);
4161 else
4163 gcc_assert (bb_vinfo);
4164 only_init = true;
4165 *ptr_incr = NULL;
4168 /* Check the step (evolution) of the load in LOOP, and record
4169 whether it's invariant. */
4170 if (nested_in_vect_loop)
4171 step = STMT_VINFO_DR_STEP (stmt_info);
4172 else
4173 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4175 if (integer_zerop (step))
4176 *inv_p = true;
4177 else
4178 *inv_p = false;
4180 /* Create an expression for the first address accessed by this load
4181 in LOOP. */
4182 base_name = get_name (DR_BASE_ADDRESS (dr));
4184 if (dump_enabled_p ())
4186 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4187 dump_printf_loc (MSG_NOTE, vect_location,
4188 "create %s-pointer variable to type: ",
4189 get_tree_code_name (TREE_CODE (aggr_type)));
4190 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4191 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4192 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4193 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4194 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4195 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4196 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4197 else
4198 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4199 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4200 dump_printf (MSG_NOTE, "\n");
4203 /* (1) Create the new aggregate-pointer variable.
4204 Vector and array types inherit the alias set of their component
4205 type by default so we need to use a ref-all pointer if the data
4206 reference does not conflict with the created aggregated data
4207 reference because it is not addressable. */
4208 bool need_ref_all = false;
4209 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4210 get_alias_set (DR_REF (dr))))
4211 need_ref_all = true;
4212 /* Likewise for any of the data references in the stmt group. */
4213 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4215 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4218 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4219 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4220 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4221 get_alias_set (DR_REF (sdr))))
4223 need_ref_all = true;
4224 break;
4226 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4228 while (orig_stmt);
4230 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4231 need_ref_all);
4232 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4235 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4236 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4237 def-use update cycles for the pointer: one relative to the outer-loop
4238 (LOOP), which is what steps (3) and (4) below do. The other is relative
4239 to the inner-loop (which is the inner-most loop containing the dataref),
4240 and this is done by step (5) below.
4242 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4243 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4244 redundant. Steps (3),(4) create the following:
4246 vp0 = &base_addr;
4247 LOOP: vp1 = phi(vp0,vp2)
4250 vp2 = vp1 + step
4251 goto LOOP
4253 If there is an inner-loop nested in loop, then step (5) will also be
4254 applied, and an additional update in the inner-loop will be created:
4256 vp0 = &base_addr;
4257 LOOP: vp1 = phi(vp0,vp2)
4259 inner: vp3 = phi(vp1,vp4)
4260 vp4 = vp3 + inner_step
4261 if () goto inner
4263 vp2 = vp1 + step
4264 if () goto LOOP */
4266 /* (2) Calculate the initial address of the aggregate-pointer, and set
4267 the aggregate-pointer to point to it before the loop. */
4269 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4271 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4272 offset, loop, byte_offset);
4273 if (new_stmt_list)
4275 if (pe)
4277 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4278 gcc_assert (!new_bb);
4280 else
4281 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4284 *initial_address = new_temp;
4285 aggr_ptr_init = new_temp;
4287 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4288 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4289 inner-loop nested in LOOP (during outer-loop vectorization). */
4291 /* No update in loop is required. */
4292 if (only_init && (!loop_vinfo || at_loop == loop))
4293 aptr = aggr_ptr_init;
4294 else
4296 /* The step of the aggregate pointer is the type size. */
4297 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4298 /* One exception to the above is when the scalar step of the load in
4299 LOOP is zero. In this case the step here is also zero. */
4300 if (*inv_p)
4301 iv_step = size_zero_node;
4302 else if (tree_int_cst_sgn (step) == -1)
4303 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4305 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4307 create_iv (aggr_ptr_init,
4308 fold_convert (aggr_ptr_type, iv_step),
4309 aggr_ptr, loop, &incr_gsi, insert_after,
4310 &indx_before_incr, &indx_after_incr);
4311 incr = gsi_stmt (incr_gsi);
4312 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4314 /* Copy the points-to information if it exists. */
4315 if (DR_PTR_INFO (dr))
4317 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4318 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4320 if (ptr_incr)
4321 *ptr_incr = incr;
4323 aptr = indx_before_incr;
4326 if (!nested_in_vect_loop || only_init)
4327 return aptr;
4330 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4331 nested in LOOP, if it exists. */
4333 gcc_assert (nested_in_vect_loop);
4334 if (!only_init)
4336 standard_iv_increment_position (containing_loop, &incr_gsi,
4337 &insert_after);
4338 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4339 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4340 &indx_after_incr);
4341 incr = gsi_stmt (incr_gsi);
4342 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4344 /* Copy the points-to information if it exists. */
4345 if (DR_PTR_INFO (dr))
4347 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4348 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4350 if (ptr_incr)
4351 *ptr_incr = incr;
4353 return indx_before_incr;
4355 else
4356 gcc_unreachable ();
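/* Editorial sketch, not part of the vectorizer: the scalar shape of the
   def-use update cycle created above for the aggregate pointer, following
   the vp0/vp1/vp2 scheme in the comment.  BASE_ADDR, STEP (in bytes),
   NITERS and the USE callback are illustrative assumptions.  */

static void
example_aggr_ptr_update (char *base_addr, long step, long niters,
			 void (*use) (char *))
{
  char *vp = base_addr;		/* vp0 = &base_addr, set in the preheader.  */
  for (long i = 0; i < niters; i++)
    {
      use (vp);			/* Each iteration sees vp1.  */
      vp += step;		/* vp2 = vp1 + step, fed back by the phi.  */
    }
}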
4360 /* Function bump_vector_ptr
4362 Increment a pointer (to a vector type) by vector-size. If requested,
4363 i.e. if PTR-INCR is given, then also connect the new increment stmt
4364 to the existing def-use update-chain of the pointer, by modifying
4365 the PTR_INCR as illustrated below:
4367 The pointer def-use update-chain before this function:
4368 DATAREF_PTR = phi (p_0, p_2)
4369 ....
4370 PTR_INCR: p_2 = DATAREF_PTR + step
4372 The pointer def-use update-chain after this function:
4373 DATAREF_PTR = phi (p_0, p_2)
4374 ....
4375 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4376 ....
4377 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4379 Input:
4380 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4381 in the loop.
4382 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4383 the loop. The increment amount across iterations is expected
4384 to be vector_size.
4385 BSI - location where the new update stmt is to be placed.
4386 STMT - the original scalar memory-access stmt that is being vectorized.
4387 BUMP - optional. The offset by which to bump the pointer. If not given,
4388 the offset is assumed to be vector_size.
4390 Output: Return NEW_DATAREF_PTR as illustrated above.
4394 tree
4395 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4396 gimple *stmt, tree bump)
4398 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4399 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4400 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4401 tree update = TYPE_SIZE_UNIT (vectype);
4402 gassign *incr_stmt;
4403 ssa_op_iter iter;
4404 use_operand_p use_p;
4405 tree new_dataref_ptr;
4407 if (bump)
4408 update = bump;
4410 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4411 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4412 else
4413 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4414 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4415 dataref_ptr, update);
4416 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4418 /* Copy the points-to information if it exists. */
4419 if (DR_PTR_INFO (dr))
4421 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4422 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4425 if (!ptr_incr)
4426 return new_dataref_ptr;
4428 /* Update the vector-pointer's cross-iteration increment. */
4429 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4431 tree use = USE_FROM_PTR (use_p);
4433 if (use == dataref_ptr)
4434 SET_USE (use_p, new_dataref_ptr);
4435 else
4436 gcc_assert (tree_int_cst_compare (use, update) == 0);
4439 return new_dataref_ptr;
4443 /* Function vect_create_destination_var.
4445 Create a new temporary of type VECTYPE. */
4447 tree
4448 vect_create_destination_var (tree scalar_dest, tree vectype)
4450 tree vec_dest;
4451 const char *name;
4452 char *new_name;
4453 tree type;
4454 enum vect_var_kind kind;
4456 kind = vectype
4457 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4458 ? vect_mask_var
4459 : vect_simple_var
4460 : vect_scalar_var;
4461 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4463 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4465 name = get_name (scalar_dest);
4466 if (name)
4467 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4468 else
4469 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4470 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4471 free (new_name);
4473 return vec_dest;
4476 /* Function vect_grouped_store_supported.
4478 Returns TRUE if interleave high and interleave low permutations
4479 are supported, and FALSE otherwise. */
4481 bool
4482 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4484 machine_mode mode = TYPE_MODE (vectype);
4486 /* vect_permute_store_chain requires the group size to be equal to 3 or
4487 be a power of two. */
4488 if (count != 3 && exact_log2 (count) == -1)
4490 if (dump_enabled_p ())
4491 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4492 "the size of the group of accesses"
4493 " is not a power of 2 or not eqaul to 3\n");
4494 return false;
4497 /* Check that the permutation is supported. */
4498 if (VECTOR_MODE_P (mode))
4500 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4501 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4503 if (count == 3)
4505 unsigned int j0 = 0, j1 = 0, j2 = 0;
4506 unsigned int i, j;
4508 for (j = 0; j < 3; j++)
4510 int nelt0 = ((3 - j) * nelt) % 3;
4511 int nelt1 = ((3 - j) * nelt + 1) % 3;
4512 int nelt2 = ((3 - j) * nelt + 2) % 3;
4513 for (i = 0; i < nelt; i++)
4515 if (3 * i + nelt0 < nelt)
4516 sel[3 * i + nelt0] = j0++;
4517 if (3 * i + nelt1 < nelt)
4518 sel[3 * i + nelt1] = nelt + j1++;
4519 if (3 * i + nelt2 < nelt)
4520 sel[3 * i + nelt2] = 0;
4522 if (!can_vec_perm_p (mode, false, sel))
4524 if (dump_enabled_p ())
4525 dump_printf (MSG_MISSED_OPTIMIZATION,
4526 "permutaion op not supported by target.\n");
4527 return false;
4530 for (i = 0; i < nelt; i++)
4532 if (3 * i + nelt0 < nelt)
4533 sel[3 * i + nelt0] = 3 * i + nelt0;
4534 if (3 * i + nelt1 < nelt)
4535 sel[3 * i + nelt1] = 3 * i + nelt1;
4536 if (3 * i + nelt2 < nelt)
4537 sel[3 * i + nelt2] = nelt + j2++;
4539 if (!can_vec_perm_p (mode, false, sel))
4541 if (dump_enabled_p ())
4542 dump_printf (MSG_MISSED_OPTIMIZATION,
4543 "permutaion op not supported by target.\n");
4544 return false;
4547 return true;
4549 else
4551 /* If length is not equal to 3 then only a power of 2 is supported. */
4552 gcc_assert (pow2p_hwi (count));
4554 for (i = 0; i < nelt / 2; i++)
4556 sel[i * 2] = i;
4557 sel[i * 2 + 1] = i + nelt;
4559 if (can_vec_perm_p (mode, false, sel))
4561 for (i = 0; i < nelt; i++)
4562 sel[i] += nelt / 2;
4563 if (can_vec_perm_p (mode, false, sel))
4564 return true;
4569 if (dump_enabled_p ())
4570 dump_printf (MSG_MISSED_OPTIMIZATION,
4571 "permutaion op not supported by target.\n");
4572 return false;
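/* Editorial sketch, not part of the vectorizer: the interleave_high/low
   selectors whose support is tested above, computed for an assumed
   vector of 8 elements.  SEL_HIGH becomes {0,8,1,9,2,10,3,11} and
   SEL_LOW {4,12,5,13,6,14,7,15}, indices into the concatenation of the
   two input vectors.  Names are illustrative.  */

static void
example_interleave_selectors (unsigned char sel_high[8],
			      unsigned char sel_low[8])
{
  const unsigned int nelt = 8;	/* Assumed number of vector elements.  */
  for (unsigned int i = 0; i < nelt / 2; i++)
    {
      sel_high[i * 2] = i;		/* Element i of the first vector.  */
      sel_high[i * 2 + 1] = i + nelt;	/* Element i of the second vector.  */
    }
  for (unsigned int i = 0; i < nelt; i++)
    sel_low[i] = sel_high[i] + nelt / 2;  /* Same pattern, upper halves.  */
}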
4576 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4577 type VECTYPE. */
4579 bool
4580 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4582 return vect_lanes_optab_supported_p ("vec_store_lanes",
4583 vec_store_lanes_optab,
4584 vectype, count);
4588 /* Function vect_permute_store_chain.
4590 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4591 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4592 the data correctly for the stores. Return the final references for stores
4593 in RESULT_CHAIN.
4595 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4596 The input is 4 vectors each containing 8 elements. We assign a number to
4597 each element, the input sequence is:
4599 1st vec: 0 1 2 3 4 5 6 7
4600 2nd vec: 8 9 10 11 12 13 14 15
4601 3rd vec: 16 17 18 19 20 21 22 23
4602 4th vec: 24 25 26 27 28 29 30 31
4604 The output sequence should be:
4606 1st vec: 0 8 16 24 1 9 17 25
4607 2nd vec: 2 10 18 26 3 11 19 27
4608 3rd vec: 4 12 20 28 5 13 21 29
4609 4th vec: 6 14 22 30 7 15 23 31
4611 i.e., we interleave the contents of the four vectors in their order.
4613 We use interleave_high/low instructions to create such output. The input of
4614 each interleave_high/low operation is two vectors:
4615 1st vec 2nd vec
4616 0 1 2 3 4 5 6 7
4617 the even elements of the result vector are obtained left-to-right from the
4618 high/low elements of the first vector. The odd elements of the result are
4619 obtained left-to-right from the high/low elements of the second vector.
4620 The output of interleave_high will be: 0 4 1 5
4621 and of interleave_low: 2 6 3 7
4624 The permutation is done in log LENGTH stages. In each stage interleave_high
4625 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4626 where the first argument is taken from the first half of DR_CHAIN and the
4627 second argument from its second half.
4628 In our example,
4630 I1: interleave_high (1st vec, 3rd vec)
4631 I2: interleave_low (1st vec, 3rd vec)
4632 I3: interleave_high (2nd vec, 4th vec)
4633 I4: interleave_low (2nd vec, 4th vec)
4635 The output for the first stage is:
4637 I1: 0 16 1 17 2 18 3 19
4638 I2: 4 20 5 21 6 22 7 23
4639 I3: 8 24 9 25 10 26 11 27
4640 I4: 12 28 13 29 14 30 15 31
4642 The output of the second stage, i.e. the final result is:
4644 I1: 0 8 16 24 1 9 17 25
4645 I2: 2 10 18 26 3 11 19 27
4646 I3: 4 12 20 28 5 13 21 29
4647 I4: 6 14 22 30 7 15 23 31. */
4649 void
4650 vect_permute_store_chain (vec<tree> dr_chain,
4651 unsigned int length,
4652 gimple *stmt,
4653 gimple_stmt_iterator *gsi,
4654 vec<tree> *result_chain)
4656 tree vect1, vect2, high, low;
4657 gimple *perm_stmt;
4658 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4659 tree perm_mask_low, perm_mask_high;
4660 tree data_ref;
4661 tree perm3_mask_low, perm3_mask_high;
4662 unsigned int i, n, log_length = exact_log2 (length);
4663 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4664 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4666 result_chain->quick_grow (length);
4667 memcpy (result_chain->address (), dr_chain.address (),
4668 length * sizeof (tree));
4670 if (length == 3)
4672 unsigned int j0 = 0, j1 = 0, j2 = 0;
4674 for (j = 0; j < 3; j++)
4676 int nelt0 = ((3 - j) * nelt) % 3;
4677 int nelt1 = ((3 - j) * nelt + 1) % 3;
4678 int nelt2 = ((3 - j) * nelt + 2) % 3;
4680 for (i = 0; i < nelt; i++)
4682 if (3 * i + nelt0 < nelt)
4683 sel[3 * i + nelt0] = j0++;
4684 if (3 * i + nelt1 < nelt)
4685 sel[3 * i + nelt1] = nelt + j1++;
4686 if (3 * i + nelt2 < nelt)
4687 sel[3 * i + nelt2] = 0;
4689 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4691 for (i = 0; i < nelt; i++)
4693 if (3 * i + nelt0 < nelt)
4694 sel[3 * i + nelt0] = 3 * i + nelt0;
4695 if (3 * i + nelt1 < nelt)
4696 sel[3 * i + nelt1] = 3 * i + nelt1;
4697 if (3 * i + nelt2 < nelt)
4698 sel[3 * i + nelt2] = nelt + j2++;
4700 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4702 vect1 = dr_chain[0];
4703 vect2 = dr_chain[1];
4705 /* Create interleaving stmt:
4706 low = VEC_PERM_EXPR <vect1, vect2,
4707 {j, nelt, *, j + 1, nelt + j + 1, *,
4708 j + 2, nelt + j + 2, *, ...}> */
4709 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4710 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4711 vect2, perm3_mask_low);
4712 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4714 vect1 = data_ref;
4715 vect2 = dr_chain[2];
4716 /* Create interleaving stmt:
4717 low = VEC_PERM_EXPR <vect1, vect2,
4718 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4719 6, 7, nelt + j + 2, ...}> */
4720 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4721 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4722 vect2, perm3_mask_high);
4723 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4724 (*result_chain)[j] = data_ref;
4727 else
4729 /* If length is not equal to 3 then only a power of 2 is supported. */
4730 gcc_assert (pow2p_hwi (length));
4732 for (i = 0, n = nelt / 2; i < n; i++)
4734 sel[i * 2] = i;
4735 sel[i * 2 + 1] = i + nelt;
4737 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4739 for (i = 0; i < nelt; i++)
4740 sel[i] += nelt / 2;
4741 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4743 for (i = 0, n = log_length; i < n; i++)
4745 for (j = 0; j < length/2; j++)
4747 vect1 = dr_chain[j];
4748 vect2 = dr_chain[j+length/2];
4750 /* Create interleaving stmt:
4751 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4752 ...}> */
4753 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4754 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4755 vect2, perm_mask_high);
4756 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4757 (*result_chain)[2*j] = high;
4759 /* Create interleaving stmt:
4760 low = VEC_PERM_EXPR <vect1, vect2,
4761 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4762 ...}> */
4763 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4764 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4765 vect2, perm_mask_low);
4766 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4767 (*result_chain)[2*j+1] = low;
4769 memcpy (dr_chain.address (), result_chain->address (),
4770 length * sizeof (tree));
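/* Editorial sketch, not part of the vectorizer: a scalar simulation of
   the power-of-two branch above for LENGTH == 4 and an assumed 8-element
   vector, using the element numbering from the function comment.  After
   the log2(LENGTH) stages CHAIN holds the documented output
   (0 8 16 24 1 9 17 25, ...).  All names are illustrative.  */

static void
example_permute_store_chain (int chain[4][8])
{
  const unsigned int length = 4, nelt = 8;
  int result[4][8];
  for (unsigned int stage = 0; stage < 2; stage++)  /* log2 (length) stages.  */
    {
      for (unsigned int j = 0; j < length / 2; j++)
	for (unsigned int i = 0; i < nelt / 2; i++)
	  {
	    /* interleave_high of chain[j] and chain[j + length/2].  */
	    result[2 * j][2 * i] = chain[j][i];
	    result[2 * j][2 * i + 1] = chain[j + length / 2][i];
	    /* interleave_low of the same pair.  */
	    result[2 * j + 1][2 * i] = chain[j][i + nelt / 2];
	    result[2 * j + 1][2 * i + 1] = chain[j + length / 2][i + nelt / 2];
	  }
      memcpy (chain, result, sizeof (result));
    }
}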
4775 /* Function vect_setup_realignment
4777 This function is called when vectorizing an unaligned load using
4778 the dr_explicit_realign[_optimized] scheme.
4779 This function generates the following code at the loop prolog:
4781 p = initial_addr;
4782 x msq_init = *(floor(p)); # prolog load
4783 realignment_token = call target_builtin;
4784 loop:
4785 x msq = phi (msq_init, ---)
4787 The stmts marked with x are generated only for the case of
4788 dr_explicit_realign_optimized.
4790 The code above sets up a new (vector) pointer, pointing to the first
4791 location accessed by STMT, and a "floor-aligned" load using that pointer.
4792 It also generates code to compute the "realignment-token" (if the relevant
4793 target hook was defined), and creates a phi-node at the loop-header bb
4794 whose arguments are the result of the prolog-load (created by this
4795 function) and the result of a load that takes place in the loop (to be
4796 created by the caller to this function).
4798 For the case of dr_explicit_realign_optimized:
4799 The caller to this function uses the phi-result (msq) to create the
4800 realignment code inside the loop, and sets up the missing phi argument,
4801 as follows:
4802 loop:
4803 msq = phi (msq_init, lsq)
4804 lsq = *(floor(p')); # load in loop
4805 result = realign_load (msq, lsq, realignment_token);
4807 For the case of dr_explicit_realign:
4808 loop:
4809 msq = *(floor(p)); # load in loop
4810 p' = p + (VS-1);
4811 lsq = *(floor(p')); # load in loop
4812 result = realign_load (msq, lsq, realignment_token);
4814 Input:
4815 STMT - (scalar) load stmt to be vectorized. This load accesses
4816 a memory location that may be unaligned.
4817 BSI - place where new code is to be inserted.
4818 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4819 is used.
4821 Output:
4822 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4823 target hook, if defined.
4824 Return value - the result of the loop-header phi node. */
4826 tree
4827 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4828 tree *realignment_token,
4829 enum dr_alignment_support alignment_support_scheme,
4830 tree init_addr,
4831 struct loop **at_loop)
4833 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4834 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4835 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4836 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4837 struct loop *loop = NULL;
4838 edge pe = NULL;
4839 tree scalar_dest = gimple_assign_lhs (stmt);
4840 tree vec_dest;
4841 gimple *inc;
4842 tree ptr;
4843 tree data_ref;
4844 basic_block new_bb;
4845 tree msq_init = NULL_TREE;
4846 tree new_temp;
4847 gphi *phi_stmt;
4848 tree msq = NULL_TREE;
4849 gimple_seq stmts = NULL;
4850 bool inv_p;
4851 bool compute_in_loop = false;
4852 bool nested_in_vect_loop = false;
4853 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4854 struct loop *loop_for_initial_load = NULL;
4856 if (loop_vinfo)
4858 loop = LOOP_VINFO_LOOP (loop_vinfo);
4859 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4862 gcc_assert (alignment_support_scheme == dr_explicit_realign
4863 || alignment_support_scheme == dr_explicit_realign_optimized);
4865 /* We need to generate three things:
4866 1. the misalignment computation
4867 2. the extra vector load (for the optimized realignment scheme).
4868 3. the phi node for the two vectors from which the realignment is
4869 done (for the optimized realignment scheme). */
4871 /* 1. Determine where to generate the misalignment computation.
4873 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4874 calculation will be generated by this function, outside the loop (in the
4875 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4876 caller, inside the loop.
4878 Background: If the misalignment remains fixed throughout the iterations of
4879 the loop, then both realignment schemes are applicable, and also the
4880 misalignment computation can be done outside LOOP. This is because we are
4881 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4882 are a multiple of VS (the Vector Size), and therefore the misalignment in
4883 different vectorized LOOP iterations is always the same.
4884 The problem arises only if the memory access is in an inner-loop nested
4885 inside LOOP, which is now being vectorized using outer-loop vectorization.
4886 This is the only case when the misalignment of the memory access may not
4887 remain fixed throughout the iterations of the inner-loop (as explained in
4888 detail in vect_supportable_dr_alignment). In this case, not only is the
4889 optimized realignment scheme not applicable, but also the misalignment
4890 computation (and generation of the realignment token that is passed to
4891 REALIGN_LOAD) has to be done inside the loop.
4893 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4894 or not, which in turn determines if the misalignment is computed inside
4895 the inner-loop, or outside LOOP. */
4897 if (init_addr != NULL_TREE || !loop_vinfo)
4899 compute_in_loop = true;
4900 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4904 /* 2. Determine where to generate the extra vector load.
4906 For the optimized realignment scheme, instead of generating two vector
4907 loads in each iteration, we generate a single extra vector load in the
4908 preheader of the loop, and in each iteration reuse the result of the
4909 vector load from the previous iteration. In case the memory access is in
4910 an inner-loop nested inside LOOP, which is now being vectorized using
4911 outer-loop vectorization, we need to determine whether this initial vector
4912 load should be generated at the preheader of the inner-loop, or can be
4913 generated at the preheader of LOOP. If the memory access has no evolution
4914 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4915 to be generated inside LOOP (in the preheader of the inner-loop). */
4917 if (nested_in_vect_loop)
4919 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4920 bool invariant_in_outerloop =
4921 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4922 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4924 else
4925 loop_for_initial_load = loop;
4926 if (at_loop)
4927 *at_loop = loop_for_initial_load;
4929 if (loop_for_initial_load)
4930 pe = loop_preheader_edge (loop_for_initial_load);
4932 /* 3. For the case of the optimized realignment, create the first vector
4933 load at the loop preheader. */
4935 if (alignment_support_scheme == dr_explicit_realign_optimized)
4937 /* Create msq_init = *(floor(p1)) in the loop preheader */
4938 gassign *new_stmt;
4940 gcc_assert (!compute_in_loop);
4941 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4942 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4943 NULL_TREE, &init_addr, NULL, &inc,
4944 true, &inv_p);
4945 if (TREE_CODE (ptr) == SSA_NAME)
4946 new_temp = copy_ssa_name (ptr);
4947 else
4948 new_temp = make_ssa_name (TREE_TYPE (ptr));
4949 new_stmt = gimple_build_assign
4950 (new_temp, BIT_AND_EXPR, ptr,
4951 build_int_cst (TREE_TYPE (ptr),
4952 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4953 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4954 gcc_assert (!new_bb);
4955 data_ref
4956 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4957 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4958 new_stmt = gimple_build_assign (vec_dest, data_ref);
4959 new_temp = make_ssa_name (vec_dest, new_stmt);
4960 gimple_assign_set_lhs (new_stmt, new_temp);
4961 if (pe)
4963 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4964 gcc_assert (!new_bb);
4966 else
4967 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4969 msq_init = gimple_assign_lhs (new_stmt);
4972 /* 4. Create realignment token using a target builtin, if available.
4973 It is done either inside the containing loop, or before LOOP (as
4974 determined above). */
4976 if (targetm.vectorize.builtin_mask_for_load)
4978 gcall *new_stmt;
4979 tree builtin_decl;
4981 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4982 if (!init_addr)
4984 /* Generate the INIT_ADDR computation outside LOOP. */
4985 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4986 NULL_TREE, loop);
4987 if (loop)
4989 pe = loop_preheader_edge (loop);
4990 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4991 gcc_assert (!new_bb);
4993 else
4994 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4997 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4998 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4999 vec_dest =
5000 vect_create_destination_var (scalar_dest,
5001 gimple_call_return_type (new_stmt));
5002 new_temp = make_ssa_name (vec_dest, new_stmt);
5003 gimple_call_set_lhs (new_stmt, new_temp);
5005 if (compute_in_loop)
5006 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5007 else
5009 /* Generate the misalignment computation outside LOOP. */
5010 pe = loop_preheader_edge (loop);
5011 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5012 gcc_assert (!new_bb);
5015 *realignment_token = gimple_call_lhs (new_stmt);
5017 /* The result of the CALL_EXPR to this builtin is determined from
5018 the value of the parameter and no global variables are touched
5019 which makes the builtin a "const" function. Requiring the
5020 builtin to have the "const" attribute makes it unnecessary
5021 to call mark_call_clobbered. */
5022 gcc_assert (TREE_READONLY (builtin_decl));
5025 if (alignment_support_scheme == dr_explicit_realign)
5026 return msq;
5028 gcc_assert (!compute_in_loop);
5029 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5032 /* 5. Create msq = phi <msq_init, lsq> in loop */
5034 pe = loop_preheader_edge (containing_loop);
5035 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5036 msq = make_ssa_name (vec_dest);
5037 phi_stmt = create_phi_node (msq, containing_loop->header);
5038 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5040 return msq;
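/* Editorial sketch, not part of the vectorizer: the element-level effect
   of the explicit realignment scheme set up above, simulated with scalar
   code.  MEM is the underlying array, P a possibly misaligned element
   index, VS the assumed number of elements per vector; MSQ and LSQ play
   the roles of the two "floor" loads and RESULT receives what
   realign_load conceptually produces.  Names and types are illustrative,
   and target-specific realignment details are ignored.  */

static void
example_explicit_realign (const int *mem, unsigned long p, unsigned int vs,
			  int *result)
{
  const int *msq = mem + (p / vs) * vs;		     /* msq = *(floor (p)).  */
  const int *lsq = mem + ((p + vs - 1) / vs) * vs;   /* lsq = *(floor (p + VS - 1)).  */
  unsigned int shift = p % vs;			     /* Misalignment in elements.  */
  for (unsigned int i = 0; i < vs; i++)
    result[i] = (i + shift < vs
		 ? msq[i + shift]		     /* Tail of the first load...  */
		 : lsq[i + shift - vs]);	     /* ...head of the second.  */
}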
5044 /* Function vect_grouped_load_supported.
5046 COUNT is the size of the load group (the number of statements plus the
5047 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5048 only one statement, with a gap of COUNT - 1.
5050 Returns true if a suitable permute exists. */
5052 bool
5053 vect_grouped_load_supported (tree vectype, bool single_element_p,
5054 unsigned HOST_WIDE_INT count)
5056 machine_mode mode = TYPE_MODE (vectype);
5058 /* If this is single-element interleaving with an element distance
5059 that leaves unused vector loads around punt - we at least create
5060 very sub-optimal code in that case (and blow up memory,
5061 see PR65518). */
5062 if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
5064 if (dump_enabled_p ())
5065 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5066 "single-element interleaving not supported "
5067 "for not adjacent vector loads\n");
5068 return false;
5071 /* vect_permute_load_chain requires the group size to be equal to 3 or
5072 be a power of two. */
5073 if (count != 3 && exact_log2 (count) == -1)
5075 if (dump_enabled_p ())
5076 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5077 "the size of the group of accesses"
5078 " is not a power of 2 or not equal to 3\n");
5079 return false;
5082 /* Check that the permutation is supported. */
5083 if (VECTOR_MODE_P (mode))
5085 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5086 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5088 if (count == 3)
5090 unsigned int k;
5091 for (k = 0; k < 3; k++)
5093 for (i = 0; i < nelt; i++)
5094 if (3 * i + k < 2 * nelt)
5095 sel[i] = 3 * i + k;
5096 else
5097 sel[i] = 0;
5098 if (!can_vec_perm_p (mode, false, sel))
5100 if (dump_enabled_p ())
5101 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5102 "shuffle of 3 loads is not supported by"
5103 " target\n");
5104 return false;
5106 for (i = 0, j = 0; i < nelt; i++)
5107 if (3 * i + k < 2 * nelt)
5108 sel[i] = i;
5109 else
5110 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5111 if (!can_vec_perm_p (mode, false, sel))
5113 if (dump_enabled_p ())
5114 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5115 "shuffle of 3 loads is not supported by"
5116 " target\n");
5117 return false;
5120 return true;
5122 else
5124 /* If length is not equal to 3 then only a power of 2 is supported. */
5125 gcc_assert (pow2p_hwi (count));
5126 for (i = 0; i < nelt; i++)
5127 sel[i] = i * 2;
5128 if (can_vec_perm_p (mode, false, sel))
5130 for (i = 0; i < nelt; i++)
5131 sel[i] = i * 2 + 1;
5132 if (can_vec_perm_p (mode, false, sel))
5133 return true;
5138 if (dump_enabled_p ())
5139 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5140 "extract even/odd not supported by target\n");
5141 return false;
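/* Editorial sketch, not part of the vectorizer: the extract_even/odd
   selectors whose support is tested above, for an assumed 8-element
   vector.  SEL_EVEN becomes {0,2,...,14} and SEL_ODD {1,3,...,15},
   indices into the concatenation of the two input vectors.  Names are
   illustrative.  */

static void
example_extract_selectors (unsigned char sel_even[8],
			   unsigned char sel_odd[8])
{
  const unsigned int nelt = 8;	/* Assumed number of vector elements.  */
  for (unsigned int i = 0; i < nelt; i++)
    {
      sel_even[i] = i * 2;	/* Even elements of the concatenation.  */
      sel_odd[i] = i * 2 + 1;	/* Odd elements of the concatenation.  */
    }
}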
5144 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5145 type VECTYPE. */
5147 bool
5148 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5150 return vect_lanes_optab_supported_p ("vec_load_lanes",
5151 vec_load_lanes_optab,
5152 vectype, count);
5155 /* Function vect_permute_load_chain.
5157 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5158 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5159 the input data correctly. Return the final references for loads in
5160 RESULT_CHAIN.
5162 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5163 The input is 4 vectors each containing 8 elements. We assign a number to each
5164 element, the input sequence is:
5166 1st vec: 0 1 2 3 4 5 6 7
5167 2nd vec: 8 9 10 11 12 13 14 15
5168 3rd vec: 16 17 18 19 20 21 22 23
5169 4th vec: 24 25 26 27 28 29 30 31
5171 The output sequence should be:
5173 1st vec: 0 4 8 12 16 20 24 28
5174 2nd vec: 1 5 9 13 17 21 25 29
5175 3rd vec: 2 6 10 14 18 22 26 30
5176 4th vec: 3 7 11 15 19 23 27 31
5178 i.e., the first output vector should contain the first elements of each
5179 interleaving group, etc.
5181 We use extract_even/odd instructions to create such output. The input of
5182 each extract_even/odd operation is two vectors
5183 1st vec 2nd vec
5184 0 1 2 3 4 5 6 7
5186 and the output is the vector of extracted even/odd elements. The output of
5187 extract_even will be: 0 2 4 6
5188 and of extract_odd: 1 3 5 7
5191 The permutation is done in log LENGTH stages. In each stage extract_even
5192 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5193 their order. In our example,
5195 E1: extract_even (1st vec, 2nd vec)
5196 E2: extract_odd (1st vec, 2nd vec)
5197 E3: extract_even (3rd vec, 4th vec)
5198 E4: extract_odd (3rd vec, 4th vec)
5200 The output for the first stage will be:
5202 E1: 0 2 4 6 8 10 12 14
5203 E2: 1 3 5 7 9 11 13 15
5204 E3: 16 18 20 22 24 26 28 30
5205 E4: 17 19 21 23 25 27 29 31
5207 In order to proceed and create the correct sequence for the next stage (or
5208 for the correct output, if the second stage is the last one, as in our
5209 example), we first put the output of extract_even operation and then the
5210 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5211 The input for the second stage is:
5213 1st vec (E1): 0 2 4 6 8 10 12 14
5214 2nd vec (E3): 16 18 20 22 24 26 28 30
5215 3rd vec (E2): 1 3 5 7 9 11 13 15
5216 4th vec (E4): 17 19 21 23 25 27 29 31
5218 The output of the second stage:
5220 E1: 0 4 8 12 16 20 24 28
5221 E2: 2 6 10 14 18 22 26 30
5222 E3: 1 5 9 13 17 21 25 29
5223 E4: 3 7 11 15 19 23 27 31
5225 And RESULT_CHAIN after reordering:
5227 1st vec (E1): 0 4 8 12 16 20 24 28
5228 2nd vec (E3): 1 5 9 13 17 21 25 29
5229 3rd vec (E2): 2 6 10 14 18 22 26 30
5230 4th vec (E4): 3 7 11 15 19 23 27 31. */
5232 static void
5233 vect_permute_load_chain (vec<tree> dr_chain,
5234 unsigned int length,
5235 gimple *stmt,
5236 gimple_stmt_iterator *gsi,
5237 vec<tree> *result_chain)
5239 tree data_ref, first_vect, second_vect;
5240 tree perm_mask_even, perm_mask_odd;
5241 tree perm3_mask_low, perm3_mask_high;
5242 gimple *perm_stmt;
5243 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5244 unsigned int i, j, log_length = exact_log2 (length);
5245 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5246 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5248 result_chain->quick_grow (length);
5249 memcpy (result_chain->address (), dr_chain.address (),
5250 length * sizeof (tree));
5252 if (length == 3)
5254 unsigned int k;
5256 for (k = 0; k < 3; k++)
5258 for (i = 0; i < nelt; i++)
5259 if (3 * i + k < 2 * nelt)
5260 sel[i] = 3 * i + k;
5261 else
5262 sel[i] = 0;
5263 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5265 for (i = 0, j = 0; i < nelt; i++)
5266 if (3 * i + k < 2 * nelt)
5267 sel[i] = i;
5268 else
5269 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5271 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5273 first_vect = dr_chain[0];
5274 second_vect = dr_chain[1];
5276 /* Create interleaving stmt (low part of):
5277 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5278 ...}> */
5279 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5280 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5281 second_vect, perm3_mask_low);
5282 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5284 /* Create interleaving stmt (high part of):
5285 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5286 ...}> */
5287 first_vect = data_ref;
5288 second_vect = dr_chain[2];
5289 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5290 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5291 second_vect, perm3_mask_high);
5292 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5293 (*result_chain)[k] = data_ref;
5296 else
5298 /* If length is not equal to 3 then only a power of 2 is supported. */
5299 gcc_assert (pow2p_hwi (length));
5301 for (i = 0; i < nelt; ++i)
5302 sel[i] = i * 2;
5303 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5305 for (i = 0; i < nelt; ++i)
5306 sel[i] = i * 2 + 1;
5307 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5309 for (i = 0; i < log_length; i++)
5311 for (j = 0; j < length; j += 2)
5313 first_vect = dr_chain[j];
5314 second_vect = dr_chain[j+1];
5316 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5317 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5318 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5319 first_vect, second_vect,
5320 perm_mask_even);
5321 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5322 (*result_chain)[j/2] = data_ref;
5324 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5325 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5326 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5327 first_vect, second_vect,
5328 perm_mask_odd);
5329 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5330 (*result_chain)[j/2+length/2] = data_ref;
5332 memcpy (dr_chain.address (), result_chain->address (),
5333 length * sizeof (tree));
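/* Editorial sketch, not part of the vectorizer: a scalar simulation of
   the power-of-two branch above for LENGTH == 4 and an assumed 8-element
   vector.  Starting from the input numbering in the function comment,
   the two stages leave CHAIN holding 0 4 8 ... 28, 1 5 9 ... 29,
   2 6 ... 30 and 3 7 ... 31, as documented.  Names are illustrative.  */

static void
example_permute_load_chain (int chain[4][8])
{
  const unsigned int length = 4, nelt = 8;
  int result[4][8];
  for (unsigned int stage = 0; stage < 2; stage++)  /* log2 (length) stages.  */
    {
      for (unsigned int j = 0; j < length; j += 2)
	for (unsigned int i = 0; i < nelt; i++)
	  {
	    /* extract_even of chain[j] and chain[j + 1].  */
	    result[j / 2][i] = (2 * i < nelt
				? chain[j][2 * i]
				: chain[j + 1][2 * i - nelt]);
	    /* extract_odd of the same pair.  */
	    result[j / 2 + length / 2][i] = (2 * i + 1 < nelt
					     ? chain[j][2 * i + 1]
					     : chain[j + 1][2 * i + 1 - nelt]);
	  }
      memcpy (chain, result, sizeof (result));
    }
}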
5338 /* Function vect_shift_permute_load_chain.
5340 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5341 sequence of stmts to reorder the input data accordingly.
5342 Return the final references for loads in RESULT_CHAIN.
5343 Return true if successful, false otherwise.
5345 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5346 The input is 3 vectors each containing 8 elements. We assign a
5347 number to each element, the input sequence is:
5349 1st vec: 0 1 2 3 4 5 6 7
5350 2nd vec: 8 9 10 11 12 13 14 15
5351 3rd vec: 16 17 18 19 20 21 22 23
5353 The output sequence should be:
5355 1st vec: 0 3 6 9 12 15 18 21
5356 2nd vec: 1 4 7 10 13 16 19 22
5357 3rd vec: 2 5 8 11 14 17 20 23
5359 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5361 First we shuffle all 3 vectors to get correct elements order:
5363 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5364 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5365 3rd vec: (16 19 22) (17 20 23) (18 21)
5367 Next we unite and shift vector 3 times:
5369 1st step:
5370 shift right by 6 the concatenation of:
5371 "1st vec" and "2nd vec"
5372 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5373 "2nd vec" and "3rd vec"
5374 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5375 "3rd vec" and "1st vec"
5376 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5377 | New vectors |
5379 So that now new vectors are:
5381 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5382 2nd vec: (10 13) (16 19 22) (17 20 23)
5383 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5385 2nd step:
5386 shift right by 5 the concatenation of:
5387 "1st vec" and "3rd vec"
5388 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5389 "2nd vec" and "1st vec"
5390 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5391 "3rd vec" and "2nd vec"
5392 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5393 | New vectors |
5395 So that now new vectors are:
5397 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5398 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5399 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5401 3rd step:
5402 shift right by 5 the concatenation of:
5403 "1st vec" and "1st vec"
5404 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5405 shift right by 3 the concatenation of:
5406 "2nd vec" and "2nd vec"
5407 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5408 | New vectors |
5410 So that now all vectors are READY:
5411 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5412 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5413 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5415 This algorithm is faster than the one in vect_permute_load_chain if:
5416 1. "shift of a concatenation" is faster than general permutation.
5417 This is usually so.
5418 2. The TARGET machine can't execute vector instructions in parallel.
5419 This matters because each step of the algorithm depends on the previous
5420 one, whereas the algorithm in vect_permute_load_chain is much more parallel.
5422 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5425 static bool
5426 vect_shift_permute_load_chain (vec<tree> dr_chain,
5427 unsigned int length,
5428 gimple *stmt,
5429 gimple_stmt_iterator *gsi,
5430 vec<tree> *result_chain)
5432 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5433 tree perm2_mask1, perm2_mask2, perm3_mask;
5434 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5435 gimple *perm_stmt;
5437 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5438 unsigned int i;
5439 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5440 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5441 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5442 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5444 result_chain->quick_grow (length);
5445 memcpy (result_chain->address (), dr_chain.address (),
5446 length * sizeof (tree));
5448 if (pow2p_hwi (length) && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5450 unsigned int j, log_length = exact_log2 (length);
5451 for (i = 0; i < nelt / 2; ++i)
5452 sel[i] = i * 2;
5453 for (i = 0; i < nelt / 2; ++i)
5454 sel[nelt / 2 + i] = i * 2 + 1;
5455 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5457 if (dump_enabled_p ())
5458 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5459 "shuffle of 2 fields structure is not \
5460 supported by target\n");
5461 return false;
5463 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5465 for (i = 0; i < nelt / 2; ++i)
5466 sel[i] = i * 2 + 1;
5467 for (i = 0; i < nelt / 2; ++i)
5468 sel[nelt / 2 + i] = i * 2;
5469 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5471 if (dump_enabled_p ())
5472 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5473 "shuffle of 2 fields structure is not \
5474 supported by target\n");
5475 return false;
5477 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5479 /* Generating permutation constant to shift all elements.
5480 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5481 for (i = 0; i < nelt; i++)
5482 sel[i] = nelt / 2 + i;
5483 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5485 if (dump_enabled_p ())
5486 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5487 "shift permutation is not supported by target\n");
5488 return false;
5490 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5492 /* Generating permutation constant to select vector from 2.
5493 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5494 for (i = 0; i < nelt / 2; i++)
5495 sel[i] = i;
5496 for (i = nelt / 2; i < nelt; i++)
5497 sel[i] = nelt + i;
5498 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5500 if (dump_enabled_p ())
5501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5502 "select is not supported by target\n");
5503 return false;
5505 select_mask = vect_gen_perm_mask_checked (vectype, sel);
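/* Worked example (illustrative only), for nelt == 8: suppose first_vect
   holds a0 b0 a1 b1 a2 b2 a3 b3 and second_vect holds a4 b4 a5 b5 a6 b6 a7 b7,
   i.e. two interleaved scalar streams a and b.  Then in the loop below

     perm2_mask1 = {0 2 4 6 1 3 5 7}
       -> vect[0] = a0 a1 a2 a3 b0 b1 b2 b3
     perm2_mask2 = {1 3 5 7 0 2 4 6}
       -> vect[1] = b4 b5 b6 b7 a4 a5 a6 a7
     shift1_mask = {4 5 6 7 8 9 10 11} applied to (vect[0], vect[1])
       -> b0 b1 b2 b3 b4 b5 b6 b7, stored at j/2 + length/2
     select_mask = {0 1 2 3 12 13 14 15} applied to (vect[0], vect[1])
       -> a0 a1 a2 a3 a4 a5 a6 a7, stored at j/2

   so each iteration de-interleaves one pair of vectors using two
   intra-vector shuffles plus one shift and one select of the
   concatenation.  */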
5507 for (i = 0; i < log_length; i++)
5509 for (j = 0; j < length; j += 2)
5511 first_vect = dr_chain[j];
5512 second_vect = dr_chain[j + 1];
5514 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5515 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5516 first_vect, first_vect,
5517 perm2_mask1);
5518 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5519 vect[0] = data_ref;
5521 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5522 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5523 second_vect, second_vect,
5524 perm2_mask2);
5525 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5526 vect[1] = data_ref;
5528 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5529 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5530 vect[0], vect[1], shift1_mask);
5531 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5532 (*result_chain)[j/2 + length/2] = data_ref;
5534 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5535 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5536 vect[0], vect[1], select_mask);
5537 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5538 (*result_chain)[j/2] = data_ref;
5540 memcpy (dr_chain.address (), result_chain->address (),
5541 length * sizeof (tree));
5543 return true;
5545 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5547 unsigned int k = 0, l = 0;
5549 /* Generating permutation constant to get all elements in right order.
5550 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5551 for (i = 0; i < nelt; i++)
5553 if (3 * k + (l % 3) >= nelt)
5555 k = 0;
5556 l += (3 - (nelt % 3));
5558 sel[i] = 3 * k + (l % 3);
5559 k++;
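/* Illustrative sketch (not part of the compiler): the selector computed by
   the loop above can be checked in isolation with the hypothetical test
   program below, which prints 0 3 6 1 4 7 2 5 for an 8-element vector.

     #include <stdio.h>

     int
     main (void)
     {
       unsigned int nelt = 8, k = 0, l = 0;
       unsigned char sel[8];
       for (unsigned int i = 0; i < nelt; i++)
         {
           // Once the index would run past NELT, restart from the
           // beginning of the vector and advance to the next field.
           if (3 * k + (l % 3) >= nelt)
             {
               k = 0;
               l += 3 - (nelt % 3);
             }
           sel[i] = 3 * k + (l % 3);
           k++;
         }
       for (unsigned int i = 0; i < nelt; i++)
         printf ("%d ", sel[i]);
       printf ("\n");
       return 0;
     }  */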
5561 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5563 if (dump_enabled_p ())
5564 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5565 "shuffle of 3 fields structure is not \
5566 supported by target\n");
5567 return false;
5569 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5571 /* Generating permutation constant to shift all elements.
5572 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5573 for (i = 0; i < nelt; i++)
5574 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5575 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5577 if (dump_enabled_p ())
5578 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5579 "shift permutation is not supported by target\n");
5580 return false;
5582 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5584 /* Generating permutation constant to shift all elements.
5585 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5586 for (i = 0; i < nelt; i++)
5587 sel[i] = 2 * (nelt / 3) + 1 + i;
5588 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5590 if (dump_enabled_p ())
5591 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5592 "shift permutation is not supported by target\n");
5593 return false;
5595 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5597 /* Generating permutation constant to shift all elements.
5598 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5599 for (i = 0; i < nelt; i++)
5600 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5601 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5603 if (dump_enabled_p ())
5604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5605 "shift permutation is not supported by target\n");
5606 return false;
5608 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5610 /* Generating permutation constant to shift all elements.
5611 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5612 for (i = 0; i < nelt; i++)
5613 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5614 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5616 if (dump_enabled_p ())
5617 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5618 "shift permutation is not supported by target\n");
5619 return false;
5621 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5623 for (k = 0; k < 3; k++)
5625 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5626 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5627 dr_chain[k], dr_chain[k],
5628 perm3_mask);
5629 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5630 vect[k] = data_ref;
5633 for (k = 0; k < 3; k++)
5635 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5636 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5637 vect[k % 3], vect[(k + 1) % 3],
5638 shift1_mask);
5639 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5640 vect_shift[k] = data_ref;
5643 for (k = 0; k < 3; k++)
5645 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5646 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5647 vect_shift[(4 - k) % 3],
5648 vect_shift[(3 - k) % 3],
5649 shift2_mask);
5650 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5651 vect[k] = data_ref;
5654 (*result_chain)[3 - (nelt % 3)] = vect[2];
5656 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5657 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5658 vect[0], shift3_mask);
5659 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5660 (*result_chain)[nelt % 3] = data_ref;
5662 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5663 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5664 vect[1], shift4_mask);
5665 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5666 (*result_chain)[0] = data_ref;
5667 return true;
5669 return false;
5672 /* Function vect_transform_grouped_load.
5674 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5675 to perform their permutation and ascribe the resulting vectorized statements
5676 to the scalar statements.
5679 void
5680 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5681 gimple_stmt_iterator *gsi)
5683 machine_mode mode;
5684 vec<tree> result_chain = vNULL;
5686 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5687 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
5688 vectors that are ready for vector computation. */
5689 result_chain.create (size);
5691 /* If the reassociation width for the vector type is 2 or greater, the target
5692 machine can execute 2 or more vector instructions in parallel.  Otherwise
5693 try to get the chain for the load group using vect_shift_permute_load_chain. */
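/* A minimal sketch of that decision (hypothetical helper, not part of GCC),
   where SHIFT_CHAIN_OK stands for the result of vect_shift_permute_load_chain,
   which in the code below is evaluated as the last operand of the condition
   so that it only runs when the first two tests fail:

     // Nonzero when the classic even/odd permutation chain is used.
     static int
     use_even_odd_chain_p (int reassoc_width, int size, int shift_chain_ok)
     {
       return reassoc_width > 1
	      || (size > 0 && (size & (size - 1)) == 0)  // like pow2p_hwi
	      || !shift_chain_ok;
     }
*/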
5694 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5695 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5696 || pow2p_hwi (size)
5697 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5698 gsi, &result_chain))
5699 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5700 vect_record_grouped_load_vectors (stmt, result_chain);
5701 result_chain.release ();
5704 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5705 generated as part of the vectorization of STMT. Assign the statement
5706 for each vector to the associated scalar statement. */
5708 void
5709 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5711 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5712 gimple *next_stmt, *new_stmt;
5713 unsigned int i, gap_count;
5714 tree tmp_data_ref;
5716 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5717 Since we scan the chain starting from its first node, their order
5718 corresponds to the order of data-refs in RESULT_CHAIN. */
5719 next_stmt = first_stmt;
5720 gap_count = 1;
5721 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5723 if (!next_stmt)
5724 break;
5726 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5727 code elimination pass later. No need to check for the first stmt in
5728 the group, since it always exists.
5729 GROUP_GAP is the number of steps in elements from the previous
5730 access (if there is no gap GROUP_GAP is 1). We skip loads that
5731 correspond to the gaps. */
5732 if (next_stmt != first_stmt
5733 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5735 gap_count++;
5736 continue;
5739 while (next_stmt)
5741 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5742 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5743 copies, and we put the new vector statement in the first available
5744 RELATED_STMT. */
5745 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5746 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5747 else
5749 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5751 gimple *prev_stmt =
5752 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5753 gimple *rel_stmt =
5754 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5755 while (rel_stmt)
5757 prev_stmt = rel_stmt;
5758 rel_stmt =
5759 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5762 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5763 new_stmt;
5767 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5768 gap_count = 1;
5769 /* If NEXT_STMT accesses the same DR as the previous statement,
5770 put the same TMP_DATA_REF as its vectorized statement; otherwise
5771 get the next data-ref from RESULT_CHAIN. */
5772 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5773 break;
5778 /* Function vect_can_force_dr_alignment_p.
5780 Return whether the alignment of DECL can be forced to an ALIGNMENT-bit
5781 boundary. */
5783 bool
5784 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5786 if (!VAR_P (decl))
5787 return false;
5789 if (decl_in_symtab_p (decl)
5790 && !symtab_node::get (decl)->can_increase_alignment_p ())
5791 return false;
5793 if (TREE_STATIC (decl))
5794 return (alignment <= MAX_OFILE_ALIGNMENT);
5795 else
5796 return (alignment <= MAX_STACK_ALIGNMENT);
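/* For illustration only (hypothetical user code, not compiler code): given

     static float a[256];   // TREE_STATIC: limited by MAX_OFILE_ALIGNMENT
     void f (void)
     {
       float b[256];        // automatic: limited by MAX_STACK_ALIGNMENT
     }

   the vectorizer may use the test above to decide whether it can raise the
   alignment of either array to the preferred vector alignment, provided that
   for 'a' the symbol table does not forbid increasing the alignment.  */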
5800 /* Return whether the data reference DR is supported with respect to its
5801 alignment.
5802 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5803 if it is aligned, i.e., check whether it is possible to vectorize it with a
5804 different alignment. */
5806 enum dr_alignment_support
5807 vect_supportable_dr_alignment (struct data_reference *dr,
5808 bool check_aligned_accesses)
5810 gimple *stmt = DR_STMT (dr);
5811 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5812 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5813 machine_mode mode = TYPE_MODE (vectype);
5814 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5815 struct loop *vect_loop = NULL;
5816 bool nested_in_vect_loop = false;
5818 if (aligned_access_p (dr) && !check_aligned_accesses)
5819 return dr_aligned;
5821 /* For now assume all conditional loads/stores support unaligned
5822 access without any special code. */
5823 if (is_gimple_call (stmt)
5824 && gimple_call_internal_p (stmt)
5825 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5826 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5827 return dr_unaligned_supported;
5829 if (loop_vinfo)
5831 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5832 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5835 /* Possibly unaligned access. */
5837 /* We can choose between using the implicit realignment scheme (generating
5838 a misaligned_move stmt) and the explicit realignment scheme (generating
5839 aligned loads with a REALIGN_LOAD). There are two variants to the
5840 explicit realignment scheme: optimized, and unoptimized.
5841 We can optimize the realignment only if the step between consecutive
5842 vector loads is equal to the vector size. Since the vector memory
5843 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5844 is guaranteed that the misalignment amount remains the same throughout the
5845 execution of the vectorized loop. Therefore, we can create the
5846 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5847 at the loop preheader.
5849 However, in the case of outer-loop vectorization, when vectorizing a
5850 memory access in the inner-loop nested within the LOOP that is now being
5851 vectorized, while it is guaranteed that the misalignment of the
5852 vectorized memory access will remain the same in different outer-loop
5853 iterations, it is *not* guaranteed that it will remain the same throughout
5854 the execution of the inner-loop. This is because the inner-loop advances
5855 with the original scalar step (and not in steps of VS). If the inner-loop
5856 step happens to be a multiple of VS, then the misalignment remains fixed
5857 and we can use the optimized realignment scheme. For example:
5859 for (i=0; i<N; i++)
5860 for (j=0; j<M; j++)
5861 s += a[i+j];
5863 When vectorizing the i-loop in the above example, the step between
5864 consecutive vector loads is 1, and so the misalignment does not remain
5865 fixed across the execution of the inner-loop, and the realignment cannot
5866 be optimized (as illustrated in the following pseudo vectorized loop):
5868 for (i=0; i<N; i+=4)
5869 for (j=0; j<M; j++){
5870 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5871 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5872 // (assuming that we start from an aligned address).
5875 We therefore have to use the unoptimized realignment scheme:
5877 for (i=0; i<N; i+=4)
5878 for (j=k; j<M; j+=4)
5879 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5880 // that the misalignment of the initial address is
5881 // 0).
5883 The loop can then be vectorized as follows:
5885 for (k=0; k<4; k++){
5886 rt = get_realignment_token (&vp[k]);
5887 for (i=0; i<N; i+=4){
5888 v1 = vp[i+k];
5889 for (j=k; j<M; j+=4){
5890 v2 = vp[i+j+VS-1];
5891 va = REALIGN_LOAD <v1,v2,rt>;
5892 vs += va;
5893 v1 = v2;
5896 } */
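/* Illustrative model (not part of the compiler) of what
   REALIGN_LOAD <v1, v2, rt> computes in the pseudo code above: it extracts
   VS consecutive elements from the concatenation of two aligned loads,
   starting at the misalignment offset encoded by RT.  A hypothetical scalar
   version for VS == 4:

     // Take elements OFF .. OFF+3 out of two adjacent aligned chunks.
     static void
     realign_load (const int *aligned_lo, const int *aligned_hi,
		   unsigned int off, int out[4])
     {
       int concat[8];
       for (int i = 0; i < 4; i++)
	 {
	   concat[i] = aligned_lo[i];
	   concat[4 + i] = aligned_hi[i];
	 }
       for (int i = 0; i < 4; i++)
	 out[i] = concat[off + i];
     }

   The optimized scheme below (dr_explicit_realign_optimized) can compute the
   offset once in the loop preheader because the misalignment is invariant;
   the plain dr_explicit_realign scheme has to recompute it per access.  */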
5898 if (DR_IS_READ (dr))
5900 bool is_packed = false;
5901 tree type = (TREE_TYPE (DR_REF (dr)));
5903 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5904 && (!targetm.vectorize.builtin_mask_for_load
5905 || targetm.vectorize.builtin_mask_for_load ()))
5907 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5909 /* If we are doing SLP then the accesses need not have the
5910 same alignment; instead it depends on the SLP group size. */
5911 if (loop_vinfo
5912 && STMT_SLP_TYPE (stmt_info)
5913 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5914 * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
5915 % TYPE_VECTOR_SUBPARTS (vectype) != 0))
5917 else if (!loop_vinfo
5918 || (nested_in_vect_loop
5919 && (TREE_INT_CST_LOW (DR_STEP (dr))
5920 != GET_MODE_SIZE (TYPE_MODE (vectype)))))
5921 return dr_explicit_realign;
5922 else
5923 return dr_explicit_realign_optimized;
5925 if (!known_alignment_for_access_p (dr))
5926 is_packed = not_size_aligned (DR_REF (dr));
5928 if (targetm.vectorize.support_vector_misalignment
5929 (mode, type, DR_MISALIGNMENT (dr), is_packed))
5930 /* Can't software pipeline the loads, but can at least do them. */
5931 return dr_unaligned_supported;
5933 else
5935 bool is_packed = false;
5936 tree type = (TREE_TYPE (DR_REF (dr)));
5938 if (!known_alignment_for_access_p (dr))
5939 is_packed = not_size_aligned (DR_REF (dr));
5941 if (targetm.vectorize.support_vector_misalignment
5942 (mode, type, DR_MISALIGNMENT (dr), is_packed))
5943 return dr_unaligned_supported;
5946 /* Unsupported. */
5947 return dr_unaligned_unsupported;