2015-11-26 Paolo Bonzini <bonzini@gnu.org>
[official-gcc.git] / gcc / tree-vect-data-refs.c
blob62e61e04a75bab21dd3abe80b04e3b0093a42c1a
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "tm_p.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "cgraph.h"
35 #include "dumpfile.h"
36 #include "alias.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "tree-eh.h"
40 #include "gimplify.h"
41 #include "gimple-iterator.h"
42 #include "gimplify-me.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "tree-ssa-loop-manip.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "expr.h"
50 #include "builtins.h"
51 #include "params.h"
53 /* Return true if load- or store-lanes optab OPTAB is implemented for
54 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
56 static bool
57 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
58 tree vectype, unsigned HOST_WIDE_INT count)
60 machine_mode mode, array_mode;
61 bool limit_p;
63 mode = TYPE_MODE (vectype);
64 limit_p = !targetm.array_mode_supported_p (mode, count);
65 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
66 MODE_INT, limit_p);
68 if (array_mode == BLKmode)
70 if (dump_enabled_p ())
71 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
72 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
73 GET_MODE_NAME (mode), count);
74 return false;
77 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
79 if (dump_enabled_p ())
80 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
81 "cannot use %s<%s><%s>\n", name,
82 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
83 return false;
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_NOTE, vect_location,
88 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
89 GET_MODE_NAME (mode));
91 return true;
95 /* Return the smallest scalar part of STMT.
96 This is used to determine the vectype of the stmt. We generally set the
97 vectype according to the type of the result (lhs). For stmts whose
98 result-type is different than the type of the arguments (e.g., demotion,
99 promotion), vectype will be reset appropriately (later). Note that we have
100 to visit the smallest datatype in this function, because that determines the
101 VF. If the smallest datatype in the loop is present only as the rhs of a
102 promotion operation - we'd miss it.
103 Such a case, where a variable of this datatype does not appear in the lhs
104 anywhere in the loop, can only occur if it's an invariant: e.g.:
105 'int_x = (int) short_inv', which we'd expect to have been optimized away by
106 invariant motion. However, we cannot rely on invariant motion to always
107 take invariants out of the loop, and so in the case of promotion we also
108 have to check the rhs.
109 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
110 types. */
112 tree
113 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
114 HOST_WIDE_INT *rhs_size_unit)
116 tree scalar_type = gimple_expr_type (stmt);
117 HOST_WIDE_INT lhs, rhs;
119 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
121 if (is_gimple_assign (stmt)
122 && (gimple_assign_cast_p (stmt)
123 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
124 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
125 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
127 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
129 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
130 if (rhs < lhs)
131 scalar_type = rhs_type;
134 *lhs_size_unit = lhs;
135 *rhs_size_unit = rhs;
136 return scalar_type;
140 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
141 tested at run-time. Return TRUE if DDR was successfully inserted.
142 Return false if versioning is not supported. */
144 static bool
145 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
147 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
149 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
150 return false;
152 if (dump_enabled_p ())
154 dump_printf_loc (MSG_NOTE, vect_location,
155 "mark for run-time aliasing test between ");
156 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
157 dump_printf (MSG_NOTE, " and ");
158 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
159 dump_printf (MSG_NOTE, "\n");
162 if (optimize_loop_nest_for_size_p (loop))
164 if (dump_enabled_p ())
165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
166 "versioning not supported when optimizing"
167 " for size.\n");
168 return false;
171 /* FORNOW: We don't support versioning with outer-loop vectorization. */
172 if (loop->inner)
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
176 "versioning not yet supported for outer-loops.\n");
177 return false;
180 /* FORNOW: We don't support creating runtime alias tests for non-constant
181 step. */
182 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
183 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
185 if (dump_enabled_p ())
186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
187 "versioning not yet supported for non-constant "
188 "step\n");
189 return false;
192 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
193 return true;
197 /* Function vect_analyze_data_ref_dependence.
199 Return TRUE if there (might) exist a dependence between a memory-reference
200 DRA and a memory-reference DRB. When versioning for alias may check a
201 dependence at run-time, return FALSE. Adjust *MAX_VF according to
202 the data dependence. */
204 static bool
205 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
206 loop_vec_info loop_vinfo, int *max_vf)
208 unsigned int i;
209 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
210 struct data_reference *dra = DDR_A (ddr);
211 struct data_reference *drb = DDR_B (ddr);
212 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
213 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
214 lambda_vector dist_v;
215 unsigned int loop_depth;
217 /* In loop analysis all data references should be vectorizable. */
218 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
219 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
220 gcc_unreachable ();
222 /* Independent data accesses. */
223 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
224 return false;
226 if (dra == drb
227 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
228 return false;
230 /* Even if we have an anti-dependence then, as the vectorized loop covers at
231 least two scalar iterations, there is always also a true dependence.
232 As the vectorizer does not re-order loads and stores we can ignore
233 the anti-dependence if TBAA can disambiguate both DRs similar to the
234 case with known negative distance anti-dependences (positive
235 distance anti-dependences would violate TBAA constraints). */
236 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
237 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
238 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
239 get_alias_set (DR_REF (drb))))
240 return false;
242 /* Unknown data dependence. */
243 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
245 /* If user asserted safelen consecutive iterations can be
246 executed concurrently, assume independence. */
247 if (loop->safelen >= 2)
249 if (loop->safelen < *max_vf)
250 *max_vf = loop->safelen;
251 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
252 return false;
255 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
256 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
258 if (dump_enabled_p ())
260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
261 "versioning for alias not supported for: "
262 "can't determine dependence between ");
263 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
264 DR_REF (dra));
265 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
266 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
267 DR_REF (drb));
268 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
270 return true;
273 if (dump_enabled_p ())
275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
276 "versioning for alias required: "
277 "can't determine dependence between ");
278 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
279 DR_REF (dra));
280 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
281 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
282 DR_REF (drb));
283 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
286 /* Add to list of ddrs that need to be tested at run-time. */
287 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
290 /* Known data dependence. */
291 if (DDR_NUM_DIST_VECTS (ddr) == 0)
293 /* If user asserted safelen consecutive iterations can be
294 executed concurrently, assume independence. */
295 if (loop->safelen >= 2)
297 if (loop->safelen < *max_vf)
298 *max_vf = loop->safelen;
299 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
300 return false;
303 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
304 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
306 if (dump_enabled_p ())
308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
309 "versioning for alias not supported for: "
310 "bad dist vector for ");
311 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
312 DR_REF (dra));
313 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
314 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
315 DR_REF (drb));
316 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
318 return true;
321 if (dump_enabled_p ())
323 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
324 "versioning for alias required: "
325 "bad dist vector for ");
326 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
327 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
328 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
329 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
331 /* Add to list of ddrs that need to be tested at run-time. */
332 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
335 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
336 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
338 int dist = dist_v[loop_depth];
340 if (dump_enabled_p ())
341 dump_printf_loc (MSG_NOTE, vect_location,
342 "dependence distance = %d.\n", dist);
344 if (dist == 0)
346 if (dump_enabled_p ())
348 dump_printf_loc (MSG_NOTE, vect_location,
349 "dependence distance == 0 between ");
350 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
351 dump_printf (MSG_NOTE, " and ");
352 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
353 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
356 /* When we perform grouped accesses and perform implicit CSE
357 by detecting equal accesses and doing disambiguation with
358 runtime alias tests like for
359 .. = a[i];
360 .. = a[i+1];
361 a[i] = ..;
362 a[i+1] = ..;
363 *p = ..;
364 .. = a[i];
365 .. = a[i+1];
366 where we will end up loading { a[i], a[i+1] } once, make
367 sure that inserting group loads before the first load and
368 stores after the last store will do the right thing.
369 Similar for groups like
370 a[i] = ...;
371 ... = a[i];
372 a[i+1] = ...;
373 where loads from the group interleave with the store. */
374 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
375 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
377 gimple *earlier_stmt;
378 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
379 if (DR_IS_WRITE
380 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
382 if (dump_enabled_p ())
383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
384 "READ_WRITE dependence in interleaving."
385 "\n");
386 return true;
390 continue;
393 if (dist > 0 && DDR_REVERSED_P (ddr))
395 /* If DDR_REVERSED_P the order of the data-refs in DDR was
396 reversed (to make distance vector positive), and the actual
397 distance is negative. */
398 if (dump_enabled_p ())
399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
400 "dependence distance negative.\n");
401 /* Record a negative dependence distance to later limit the
402 amount of stmt copying / unrolling we can perform.
403 Only need to handle read-after-write dependence. */
404 if (DR_IS_READ (drb)
405 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
406 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
407 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
408 continue;
411 if (abs (dist) >= 2
412 && abs (dist) < *max_vf)
414 /* The dependence distance requires reduction of the maximal
415 vectorization factor. */
416 *max_vf = abs (dist);
417 if (dump_enabled_p ())
418 dump_printf_loc (MSG_NOTE, vect_location,
419 "adjusting maximal vectorization factor to %i\n",
420 *max_vf);
423 if (abs (dist) >= *max_vf)
425 /* Dependence distance does not create dependence, as far as
426 vectorization is concerned, in this case. */
427 if (dump_enabled_p ())
428 dump_printf_loc (MSG_NOTE, vect_location,
429 "dependence distance >= VF.\n");
430 continue;
433 if (dump_enabled_p ())
435 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
436 "not vectorized, possible dependence "
437 "between data-refs ");
438 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
439 dump_printf (MSG_NOTE, " and ");
440 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
441 dump_printf (MSG_NOTE, "\n");
444 return true;
447 return false;
450 /* Function vect_analyze_data_ref_dependences.
452 Examine all the data references in the loop, and make sure there do not
453 exist any data dependences between them. Set *MAX_VF according to
454 the maximum vectorization factor the data dependences allow. */
456 bool
457 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
459 unsigned int i;
460 struct data_dependence_relation *ddr;
462 if (dump_enabled_p ())
463 dump_printf_loc (MSG_NOTE, vect_location,
464 "=== vect_analyze_data_ref_dependences ===\n");
466 LOOP_VINFO_DDRS (loop_vinfo)
467 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
468 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
469 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
470 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
471 &LOOP_VINFO_DDRS (loop_vinfo),
472 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
473 return false;
475 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
476 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
477 return false;
479 return true;
483 /* Function vect_slp_analyze_data_ref_dependence.
485 Return TRUE if there (might) exist a dependence between a memory-reference
486 DRA and a memory-reference DRB. When versioning for alias may check a
487 dependence at run-time, return FALSE. Adjust *MAX_VF according to
488 the data dependence. */
490 static bool
491 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
493 struct data_reference *dra = DDR_A (ddr);
494 struct data_reference *drb = DDR_B (ddr);
496 /* We need to check dependences of statements marked as unvectorizable
497 as well, they still can prohibit vectorization. */
499 /* Independent data accesses. */
500 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
501 return false;
503 if (dra == drb)
504 return false;
506 /* Read-read is OK. */
507 if (DR_IS_READ (dra) && DR_IS_READ (drb))
508 return false;
510 /* If dra and drb are part of the same interleaving chain consider
511 them independent. */
512 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
513 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
514 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
515 return false;
517 /* Unknown data dependence. */
518 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
520 if (dump_enabled_p ())
522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
523 "can't determine dependence between ");
524 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
525 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
526 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
527 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
530 else if (dump_enabled_p ())
532 dump_printf_loc (MSG_NOTE, vect_location,
533 "determined dependence between ");
534 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
535 dump_printf (MSG_NOTE, " and ");
536 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
537 dump_printf (MSG_NOTE, "\n");
540 return true;
544 /* Analyze dependences involved in the transform of SLP NODE. STORES
545 contain the vector of scalar stores of this instance if we are
546 disambiguating the loads. */
548 static bool
549 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
550 vec<gimple *> stores, gimple *last_store)
552 /* This walks over all stmts involved in the SLP load/store done
553 in NODE verifying we can sink them up to the last stmt in the
554 group. */
555 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
556 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
558 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
559 if (access == last_access)
560 continue;
561 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
562 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
563 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
565 gimple *stmt = gsi_stmt (gsi);
566 if (! gimple_vuse (stmt)
567 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
568 continue;
570 /* If we couldn't record a (single) data reference for this
571 stmt we have to give up. */
572 /* ??? Here and below if dependence analysis fails we can resort
573 to the alias oracle which can handle more kinds of stmts. */
574 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
575 if (!dr_b)
576 return false;
578 /* If we run into a store of this same instance (we've just
579 marked those) then delay dependence checking until we run
580 into the last store because this is where it will have
581 been sunk to (and we verify if we can do that as well). */
582 if (gimple_visited_p (stmt))
584 if (stmt != last_store)
585 continue;
586 unsigned i;
587 gimple *store;
588 FOR_EACH_VEC_ELT (stores, i, store)
590 data_reference *store_dr
591 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
592 ddr_p ddr = initialize_data_dependence_relation
593 (dr_a, store_dr, vNULL);
594 if (vect_slp_analyze_data_ref_dependence (ddr))
596 free_dependence_relation (ddr);
597 return false;
599 free_dependence_relation (ddr);
603 ddr_p ddr = initialize_data_dependence_relation (dr_a, dr_b, vNULL);
604 if (vect_slp_analyze_data_ref_dependence (ddr))
606 free_dependence_relation (ddr);
607 return false;
609 free_dependence_relation (ddr);
612 return true;
616 /* Function vect_analyze_data_ref_dependences.
618 Examine all the data references in the basic-block, and make sure there
619 do not exist any data dependences between them. Set *MAX_VF according to
620 the maximum vectorization factor the data dependences allow. */
622 bool
623 vect_slp_analyze_instance_dependence (slp_instance instance)
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE, vect_location,
627 "=== vect_slp_analyze_instance_dependence ===\n");
629 /* The stores of this instance are at the root of the SLP tree. */
630 slp_tree store = SLP_INSTANCE_TREE (instance);
631 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
632 store = NULL;
634 /* Verify we can sink stores to the vectorized stmt insert location. */
635 gimple *last_store = NULL;
636 if (store)
638 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
639 return false;
641 /* Mark stores in this instance and remember the last one. */
642 last_store = vect_find_last_scalar_stmt_in_slp (store);
643 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
644 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
647 bool res = true;
649 /* Verify we can sink loads to the vectorized stmt insert location,
650 special-casing stores of this instance. */
651 slp_tree load;
652 unsigned int i;
653 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
654 if (! vect_slp_analyze_node_dependences (instance, load,
655 store
656 ? SLP_TREE_SCALAR_STMTS (store)
657 : vNULL, last_store))
659 res = false;
660 break;
663 /* Unset the visited flag. */
664 if (store)
665 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
666 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
668 return res;
671 /* Function vect_compute_data_ref_alignment
673 Compute the misalignment of the data reference DR.
675 Output:
676 1. If during the misalignment computation it is found that the data reference
677 cannot be vectorized then false is returned.
678 2. DR_MISALIGNMENT (DR) is defined.
680 FOR NOW: No analysis is actually performed. Misalignment is calculated
681 only for trivial cases. TODO. */
683 bool
684 vect_compute_data_ref_alignment (struct data_reference *dr)
686 gimple *stmt = DR_STMT (dr);
687 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
688 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
689 struct loop *loop = NULL;
690 tree ref = DR_REF (dr);
691 tree vectype;
692 tree base, base_addr;
693 tree misalign = NULL_TREE;
694 tree aligned_to;
695 unsigned HOST_WIDE_INT alignment;
697 if (dump_enabled_p ())
698 dump_printf_loc (MSG_NOTE, vect_location,
699 "vect_compute_data_ref_alignment:\n");
701 if (loop_vinfo)
702 loop = LOOP_VINFO_LOOP (loop_vinfo);
704 /* Initialize misalignment to unknown. */
705 SET_DR_MISALIGNMENT (dr, -1);
707 if (tree_fits_shwi_p (DR_STEP (dr)))
708 misalign = DR_INIT (dr);
709 aligned_to = DR_ALIGNED_TO (dr);
710 base_addr = DR_BASE_ADDRESS (dr);
711 vectype = STMT_VINFO_VECTYPE (stmt_info);
713 /* In case the dataref is in an inner-loop of the loop that is being
714 vectorized (LOOP), we use the base and misalignment information
715 relative to the outer-loop (LOOP). This is ok only if the misalignment
716 stays the same throughout the execution of the inner-loop, which is why
717 we have to check that the stride of the dataref in the inner-loop evenly
718 divides by the vector size. */
719 if (loop && nested_in_vect_loop_p (loop, stmt))
721 tree step = DR_STEP (dr);
723 if (tree_fits_shwi_p (step)
724 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
726 if (dump_enabled_p ())
727 dump_printf_loc (MSG_NOTE, vect_location,
728 "inner step divides the vector-size.\n");
729 misalign = STMT_VINFO_DR_INIT (stmt_info);
730 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
731 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
733 else
735 if (dump_enabled_p ())
736 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
737 "inner step doesn't divide the vector-size.\n");
738 misalign = NULL_TREE;
742 /* Similarly we can only use base and misalignment information relative to
743 an innermost loop if the misalignment stays the same throughout the
744 execution of the loop. As above, this is the case if the stride of
745 the dataref evenly divides by the vector size. */
746 else
748 tree step = DR_STEP (dr);
749 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
751 if (tree_fits_shwi_p (step)
752 && ((tree_to_shwi (step) * vf)
753 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
755 if (dump_enabled_p ())
756 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
757 "step doesn't divide the vector-size.\n");
758 misalign = NULL_TREE;
762 /* To look at alignment of the base we have to preserve an inner MEM_REF
763 as that carries alignment information of the actual access. */
764 base = ref;
765 while (handled_component_p (base))
766 base = TREE_OPERAND (base, 0);
767 if (TREE_CODE (base) == MEM_REF)
768 base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
769 build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
770 unsigned int base_alignment = get_object_alignment (base);
772 if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
773 DR_VECT_AUX (dr)->base_element_aligned = true;
775 alignment = TYPE_ALIGN_UNIT (vectype);
777 if ((compare_tree_int (aligned_to, alignment) < 0)
778 || !misalign)
780 if (dump_enabled_p ())
782 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
783 "Unknown alignment for access: ");
784 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
785 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
787 return true;
790 if (base_alignment < TYPE_ALIGN (vectype))
792 /* Strip an inner MEM_REF to a bare decl if possible. */
793 if (TREE_CODE (base) == MEM_REF
794 && integer_zerop (TREE_OPERAND (base, 1))
795 && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
796 base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
798 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
800 if (dump_enabled_p ())
802 dump_printf_loc (MSG_NOTE, vect_location,
803 "can't force alignment of ref: ");
804 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
805 dump_printf (MSG_NOTE, "\n");
807 return true;
810 /* Force the alignment of the decl.
811 NOTE: This is the only change to the code we make during
812 the analysis phase, before deciding to vectorize the loop. */
813 if (dump_enabled_p ())
815 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
816 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
817 dump_printf (MSG_NOTE, "\n");
820 DR_VECT_AUX (dr)->base_decl = base;
821 DR_VECT_AUX (dr)->base_misaligned = true;
822 DR_VECT_AUX (dr)->base_element_aligned = true;
825 /* If this is a backward running DR then first access in the larger
826 vectype actually is N-1 elements before the address in the DR.
827 Adjust misalign accordingly. */
828 if (tree_int_cst_sgn (DR_STEP (dr)) < 0)
830 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
831 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
832 otherwise we wouldn't be here. */
833 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
834 /* PLUS because DR_STEP was negative. */
835 misalign = size_binop (PLUS_EXPR, misalign, offset);
838 SET_DR_MISALIGNMENT (dr,
839 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
841 if (dump_enabled_p ())
843 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
844 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
845 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
846 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
849 return true;
853 /* Function vect_update_misalignment_for_peel
855 DR - the data reference whose misalignment is to be adjusted.
856 DR_PEEL - the data reference whose misalignment is being made
857 zero in the vector loop by the peel.
858 NPEEL - the number of iterations in the peel loop if the misalignment
859 of DR_PEEL is known at compile time. */
861 static void
862 vect_update_misalignment_for_peel (struct data_reference *dr,
863 struct data_reference *dr_peel, int npeel)
865 unsigned int i;
866 vec<dr_p> same_align_drs;
867 struct data_reference *current_dr;
868 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
869 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
870 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
871 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
873 /* For interleaved data accesses the step in the loop must be multiplied by
874 the size of the interleaving group. */
875 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
876 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
877 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
878 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
880 /* It can be assumed that the data refs with the same alignment as dr_peel
881 are aligned in the vector loop. */
882 same_align_drs
883 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
884 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
886 if (current_dr != dr)
887 continue;
888 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
889 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
890 SET_DR_MISALIGNMENT (dr, 0);
891 return;
894 if (known_alignment_for_access_p (dr)
895 && known_alignment_for_access_p (dr_peel))
897 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
898 int misal = DR_MISALIGNMENT (dr);
899 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
900 misal += negative ? -npeel * dr_size : npeel * dr_size;
901 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
902 SET_DR_MISALIGNMENT (dr, misal);
903 return;
906 if (dump_enabled_p ())
907 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
908 SET_DR_MISALIGNMENT (dr, -1);
912 /* Function verify_data_ref_alignment
914 Return TRUE if DR can be handled with respect to alignment. */
916 static bool
917 verify_data_ref_alignment (data_reference_p dr)
919 enum dr_alignment_support supportable_dr_alignment
920 = vect_supportable_dr_alignment (dr, false);
921 if (!supportable_dr_alignment)
923 if (dump_enabled_p ())
925 if (DR_IS_READ (dr))
926 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
927 "not vectorized: unsupported unaligned load.");
928 else
929 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
930 "not vectorized: unsupported unaligned "
931 "store.");
933 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
934 DR_REF (dr));
935 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
937 return false;
940 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
941 dump_printf_loc (MSG_NOTE, vect_location,
942 "Vectorizing an unaligned access.\n");
944 return true;
947 /* Function vect_verify_datarefs_alignment
949 Return TRUE if all data references in the loop can be
950 handled with respect to alignment. */
952 bool
953 vect_verify_datarefs_alignment (loop_vec_info vinfo)
955 vec<data_reference_p> datarefs = vinfo->datarefs;
956 struct data_reference *dr;
957 unsigned int i;
959 FOR_EACH_VEC_ELT (datarefs, i, dr)
961 gimple *stmt = DR_STMT (dr);
962 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
964 if (!STMT_VINFO_RELEVANT_P (stmt_info))
965 continue;
967 /* For interleaving, only the alignment of the first access matters. */
968 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
969 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
970 continue;
972 /* Strided accesses perform only component accesses, alignment is
973 irrelevant for them. */
974 if (STMT_VINFO_STRIDED_P (stmt_info)
975 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
976 continue;
978 if (! verify_data_ref_alignment (dr))
979 return false;
982 return true;
985 /* Given an memory reference EXP return whether its alignment is less
986 than its size. */
988 static bool
989 not_size_aligned (tree exp)
991 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
992 return true;
994 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
995 > get_object_alignment (exp));
998 /* Function vector_alignment_reachable_p
1000 Return true if vector alignment for DR is reachable by peeling
1001 a few loop iterations. Return false otherwise. */
1003 static bool
1004 vector_alignment_reachable_p (struct data_reference *dr)
1006 gimple *stmt = DR_STMT (dr);
1007 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1008 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1010 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1012 /* For interleaved access we peel only if number of iterations in
1013 the prolog loop ({VF - misalignment}), is a multiple of the
1014 number of the interleaved accesses. */
1015 int elem_size, mis_in_elements;
1016 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1018 /* FORNOW: handle only known alignment. */
1019 if (!known_alignment_for_access_p (dr))
1020 return false;
1022 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1023 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1025 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1026 return false;
1029 /* If misalignment is known at the compile time then allow peeling
1030 only if natural alignment is reachable through peeling. */
1031 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1033 HOST_WIDE_INT elmsize =
1034 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1035 if (dump_enabled_p ())
1037 dump_printf_loc (MSG_NOTE, vect_location,
1038 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1039 dump_printf (MSG_NOTE,
1040 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1042 if (DR_MISALIGNMENT (dr) % elmsize)
1044 if (dump_enabled_p ())
1045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1046 "data size does not divide the misalignment.\n");
1047 return false;
1051 if (!known_alignment_for_access_p (dr))
1053 tree type = TREE_TYPE (DR_REF (dr));
1054 bool is_packed = not_size_aligned (DR_REF (dr));
1055 if (dump_enabled_p ())
1056 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1057 "Unknown misalignment, is_packed = %d\n",is_packed);
1058 if ((TYPE_USER_ALIGN (type) && !is_packed)
1059 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1060 return true;
1061 else
1062 return false;
1065 return true;
1069 /* Calculate the cost of the memory access represented by DR. */
1071 static void
1072 vect_get_data_access_cost (struct data_reference *dr,
1073 unsigned int *inside_cost,
1074 unsigned int *outside_cost,
1075 stmt_vector_for_cost *body_cost_vec)
1077 gimple *stmt = DR_STMT (dr);
1078 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1079 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1080 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1081 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1082 int ncopies = vf / nunits;
1084 if (DR_IS_READ (dr))
1085 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1086 NULL, body_cost_vec, false);
1087 else
1088 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1090 if (dump_enabled_p ())
1091 dump_printf_loc (MSG_NOTE, vect_location,
1092 "vect_get_data_access_cost: inside_cost = %d, "
1093 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1097 typedef struct _vect_peel_info
1099 int npeel;
1100 struct data_reference *dr;
1101 unsigned int count;
1102 } *vect_peel_info;
1104 typedef struct _vect_peel_extended_info
1106 struct _vect_peel_info peel_info;
1107 unsigned int inside_cost;
1108 unsigned int outside_cost;
1109 stmt_vector_for_cost body_cost_vec;
1110 } *vect_peel_extended_info;
1113 /* Peeling hashtable helpers. */
1115 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1117 static inline hashval_t hash (const _vect_peel_info *);
1118 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1121 inline hashval_t
1122 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1124 return (hashval_t) peel_info->npeel;
1127 inline bool
1128 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1130 return (a->npeel == b->npeel);
1134 /* Insert DR into peeling hash table with NPEEL as key. */
1136 static void
1137 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1138 loop_vec_info loop_vinfo, struct data_reference *dr,
1139 int npeel)
1141 struct _vect_peel_info elem, *slot;
1142 _vect_peel_info **new_slot;
1143 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1145 elem.npeel = npeel;
1146 slot = peeling_htab->find (&elem);
1147 if (slot)
1148 slot->count++;
1149 else
1151 slot = XNEW (struct _vect_peel_info);
1152 slot->npeel = npeel;
1153 slot->dr = dr;
1154 slot->count = 1;
1155 new_slot = peeling_htab->find_slot (slot, INSERT);
1156 *new_slot = slot;
1159 if (!supportable_dr_alignment
1160 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1161 slot->count += VECT_MAX_COST;
1165 /* Traverse peeling hash table to find peeling option that aligns maximum
1166 number of data accesses. */
1169 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1170 _vect_peel_extended_info *max)
1172 vect_peel_info elem = *slot;
1174 if (elem->count > max->peel_info.count
1175 || (elem->count == max->peel_info.count
1176 && max->peel_info.npeel > elem->npeel))
1178 max->peel_info.npeel = elem->npeel;
1179 max->peel_info.count = elem->count;
1180 max->peel_info.dr = elem->dr;
1183 return 1;
1187 /* Traverse peeling hash table and calculate cost for each peeling option.
1188 Find the one with the lowest cost. */
1191 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1192 _vect_peel_extended_info *min)
1194 vect_peel_info elem = *slot;
1195 int save_misalignment, dummy;
1196 unsigned int inside_cost = 0, outside_cost = 0, i;
1197 gimple *stmt = DR_STMT (elem->dr);
1198 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1199 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1200 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1201 struct data_reference *dr;
1202 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1204 prologue_cost_vec.create (2);
1205 body_cost_vec.create (2);
1206 epilogue_cost_vec.create (2);
1208 FOR_EACH_VEC_ELT (datarefs, i, dr)
1210 stmt = DR_STMT (dr);
1211 stmt_info = vinfo_for_stmt (stmt);
1212 /* For interleaving, only the alignment of the first access
1213 matters. */
1214 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1215 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1216 continue;
1218 save_misalignment = DR_MISALIGNMENT (dr);
1219 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1220 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1221 &body_cost_vec);
1222 SET_DR_MISALIGNMENT (dr, save_misalignment);
1225 outside_cost += vect_get_known_peeling_cost
1226 (loop_vinfo, elem->npeel, &dummy,
1227 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1228 &prologue_cost_vec, &epilogue_cost_vec);
1230 /* Prologue and epilogue costs are added to the target model later.
1231 These costs depend only on the scalar iteration cost, the
1232 number of peeling iterations finally chosen, and the number of
1233 misaligned statements. So discard the information found here. */
1234 prologue_cost_vec.release ();
1235 epilogue_cost_vec.release ();
1237 if (inside_cost < min->inside_cost
1238 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1240 min->inside_cost = inside_cost;
1241 min->outside_cost = outside_cost;
1242 min->body_cost_vec.release ();
1243 min->body_cost_vec = body_cost_vec;
1244 min->peel_info.dr = elem->dr;
1245 min->peel_info.npeel = elem->npeel;
1247 else
1248 body_cost_vec.release ();
1250 return 1;
1254 /* Choose best peeling option by traversing peeling hash table and either
1255 choosing an option with the lowest cost (if cost model is enabled) or the
1256 option that aligns as many accesses as possible. */
1258 static struct data_reference *
1259 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1260 loop_vec_info loop_vinfo,
1261 unsigned int *npeel,
1262 stmt_vector_for_cost *body_cost_vec)
1264 struct _vect_peel_extended_info res;
1266 res.peel_info.dr = NULL;
1267 res.body_cost_vec = stmt_vector_for_cost ();
1269 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1271 res.inside_cost = INT_MAX;
1272 res.outside_cost = INT_MAX;
1273 peeling_htab->traverse <_vect_peel_extended_info *,
1274 vect_peeling_hash_get_lowest_cost> (&res);
1276 else
1278 res.peel_info.count = 0;
1279 peeling_htab->traverse <_vect_peel_extended_info *,
1280 vect_peeling_hash_get_most_frequent> (&res);
1283 *npeel = res.peel_info.npeel;
1284 *body_cost_vec = res.body_cost_vec;
1285 return res.peel_info.dr;
1289 /* Function vect_enhance_data_refs_alignment
1291 This pass will use loop versioning and loop peeling in order to enhance
1292 the alignment of data references in the loop.
1294 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1295 original loop is to be vectorized. Any other loops that are created by
1296 the transformations performed in this pass - are not supposed to be
1297 vectorized. This restriction will be relaxed.
1299 This pass will require a cost model to guide it whether to apply peeling
1300 or versioning or a combination of the two. For example, the scheme that
1301 intel uses when given a loop with several memory accesses, is as follows:
1302 choose one memory access ('p') which alignment you want to force by doing
1303 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1304 other accesses are not necessarily aligned, or (2) use loop versioning to
1305 generate one loop in which all accesses are aligned, and another loop in
1306 which only 'p' is necessarily aligned.
1308 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1309 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1310 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1312 Devising a cost model is the most critical aspect of this work. It will
1313 guide us on which access to peel for, whether to use loop versioning, how
1314 many versions to create, etc. The cost model will probably consist of
1315 generic considerations as well as target specific considerations (on
1316 powerpc for example, misaligned stores are more painful than misaligned
1317 loads).
1319 Here are the general steps involved in alignment enhancements:
1321 -- original loop, before alignment analysis:
1322 for (i=0; i<N; i++){
1323 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1324 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1327 -- After vect_compute_data_refs_alignment:
1328 for (i=0; i<N; i++){
1329 x = q[i]; # DR_MISALIGNMENT(q) = 3
1330 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1333 -- Possibility 1: we do loop versioning:
1334 if (p is aligned) {
1335 for (i=0; i<N; i++){ # loop 1A
1336 x = q[i]; # DR_MISALIGNMENT(q) = 3
1337 p[i] = y; # DR_MISALIGNMENT(p) = 0
1340 else {
1341 for (i=0; i<N; i++){ # loop 1B
1342 x = q[i]; # DR_MISALIGNMENT(q) = 3
1343 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1347 -- Possibility 2: we do loop peeling:
1348 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1349 x = q[i];
1350 p[i] = y;
1352 for (i = 3; i < N; i++){ # loop 2A
1353 x = q[i]; # DR_MISALIGNMENT(q) = 0
1354 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1357 -- Possibility 3: combination of loop peeling and versioning:
1358 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1359 x = q[i];
1360 p[i] = y;
1362 if (p is aligned) {
1363 for (i = 3; i<N; i++){ # loop 3A
1364 x = q[i]; # DR_MISALIGNMENT(q) = 0
1365 p[i] = y; # DR_MISALIGNMENT(p) = 0
1368 else {
1369 for (i = 3; i<N; i++){ # loop 3B
1370 x = q[i]; # DR_MISALIGNMENT(q) = 0
1371 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1375 These loops are later passed to loop_transform to be vectorized. The
1376 vectorizer will use the alignment information to guide the transformation
1377 (whether to generate regular loads/stores, or with special handling for
1378 misalignment). */
1380 bool
1381 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1383 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1384 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1385 enum dr_alignment_support supportable_dr_alignment;
1386 struct data_reference *dr0 = NULL, *first_store = NULL;
1387 struct data_reference *dr;
1388 unsigned int i, j;
1389 bool do_peeling = false;
1390 bool do_versioning = false;
1391 bool stat;
1392 gimple *stmt;
1393 stmt_vec_info stmt_info;
1394 unsigned int npeel = 0;
1395 bool all_misalignments_unknown = true;
1396 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1397 unsigned possible_npeel_number = 1;
1398 tree vectype;
1399 unsigned int nelements, mis, same_align_drs_max = 0;
1400 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1401 hash_table<peel_info_hasher> peeling_htab (1);
1403 if (dump_enabled_p ())
1404 dump_printf_loc (MSG_NOTE, vect_location,
1405 "=== vect_enhance_data_refs_alignment ===\n");
1407 /* Reset data so we can safely be called multiple times. */
1408 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1409 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1411 /* While cost model enhancements are expected in the future, the high level
1412 view of the code at this time is as follows:
1414 A) If there is a misaligned access then see if peeling to align
1415 this access can make all data references satisfy
1416 vect_supportable_dr_alignment. If so, update data structures
1417 as needed and return true.
1419 B) If peeling wasn't possible and there is a data reference with an
1420 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1421 then see if loop versioning checks can be used to make all data
1422 references satisfy vect_supportable_dr_alignment. If so, update
1423 data structures as needed and return true.
1425 C) If neither peeling nor versioning were successful then return false if
1426 any data reference does not satisfy vect_supportable_dr_alignment.
1428 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1430 Note, Possibility 3 above (which is peeling and versioning together) is not
1431 being done at this time. */
1433 /* (1) Peeling to force alignment. */
1435 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1436 Considerations:
1437 + How many accesses will become aligned due to the peeling
1438 - How many accesses will become unaligned due to the peeling,
1439 and the cost of misaligned accesses.
1440 - The cost of peeling (the extra runtime checks, the increase
1441 in code size). */
1443 FOR_EACH_VEC_ELT (datarefs, i, dr)
1445 stmt = DR_STMT (dr);
1446 stmt_info = vinfo_for_stmt (stmt);
1448 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1449 continue;
1451 /* For interleaving, only the alignment of the first access
1452 matters. */
1453 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1454 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1455 continue;
1457 /* For invariant accesses there is nothing to enhance. */
1458 if (integer_zerop (DR_STEP (dr)))
1459 continue;
1461 /* Strided accesses perform only component accesses, alignment is
1462 irrelevant for them. */
1463 if (STMT_VINFO_STRIDED_P (stmt_info)
1464 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1465 continue;
1467 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1468 do_peeling = vector_alignment_reachable_p (dr);
1469 if (do_peeling)
1471 if (known_alignment_for_access_p (dr))
1473 unsigned int npeel_tmp;
1474 bool negative = tree_int_cst_compare (DR_STEP (dr),
1475 size_zero_node) < 0;
1477 /* Save info about DR in the hash table. */
1478 vectype = STMT_VINFO_VECTYPE (stmt_info);
1479 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1480 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1481 TREE_TYPE (DR_REF (dr))));
1482 npeel_tmp = (negative
1483 ? (mis - nelements) : (nelements - mis))
1484 & (nelements - 1);
1486 /* For multiple types, it is possible that the bigger type access
1487 will have more than one peeling option. E.g., a loop with two
1488 types: one of size (vector size / 4), and the other one of
1489 size (vector size / 8). Vectorization factor will 8. If both
1490 access are misaligned by 3, the first one needs one scalar
1491 iteration to be aligned, and the second one needs 5. But the
1492 the first one will be aligned also by peeling 5 scalar
1493 iterations, and in that case both accesses will be aligned.
1494 Hence, except for the immediate peeling amount, we also want
1495 to try to add full vector size, while we don't exceed
1496 vectorization factor.
1497 We do this automtically for cost model, since we calculate cost
1498 for every peeling option. */
1499 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1501 if (STMT_SLP_TYPE (stmt_info))
1502 possible_npeel_number
1503 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1504 else
1505 possible_npeel_number = vf / nelements;
1508 /* Handle the aligned case. We may decide to align some other
1509 access, making DR unaligned. */
1510 if (DR_MISALIGNMENT (dr) == 0)
1512 npeel_tmp = 0;
1513 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1514 possible_npeel_number++;
1517 for (j = 0; j < possible_npeel_number; j++)
1519 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1520 dr, npeel_tmp);
1521 npeel_tmp += nelements;
1524 all_misalignments_unknown = false;
1525 /* Data-ref that was chosen for the case that all the
1526 misalignments are unknown is not relevant anymore, since we
1527 have a data-ref with known alignment. */
1528 dr0 = NULL;
1530 else
1532 /* If we don't know any misalignment values, we prefer
1533 peeling for data-ref that has the maximum number of data-refs
1534 with the same alignment, unless the target prefers to align
1535 stores over load. */
1536 if (all_misalignments_unknown)
1538 unsigned same_align_drs
1539 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1540 if (!dr0
1541 || same_align_drs_max < same_align_drs)
1543 same_align_drs_max = same_align_drs;
1544 dr0 = dr;
1546 /* For data-refs with the same number of related
1547 accesses prefer the one where the misalign
1548 computation will be invariant in the outermost loop. */
1549 else if (same_align_drs_max == same_align_drs)
1551 struct loop *ivloop0, *ivloop;
1552 ivloop0 = outermost_invariant_loop_for_expr
1553 (loop, DR_BASE_ADDRESS (dr0));
1554 ivloop = outermost_invariant_loop_for_expr
1555 (loop, DR_BASE_ADDRESS (dr));
1556 if ((ivloop && !ivloop0)
1557 || (ivloop && ivloop0
1558 && flow_loop_nested_p (ivloop, ivloop0)))
1559 dr0 = dr;
1562 if (!first_store && DR_IS_WRITE (dr))
1563 first_store = dr;
1566 /* If there are both known and unknown misaligned accesses in the
1567 loop, we choose peeling amount according to the known
1568 accesses. */
1569 if (!supportable_dr_alignment)
1571 dr0 = dr;
1572 if (!first_store && DR_IS_WRITE (dr))
1573 first_store = dr;
1577 else
1579 if (!aligned_access_p (dr))
1581 if (dump_enabled_p ())
1582 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1583 "vector alignment may not be reachable\n");
1584 break;
1589 /* Check if we can possibly peel the loop. */
1590 if (!vect_can_advance_ivs_p (loop_vinfo)
1591 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1592 || loop->inner)
1593 do_peeling = false;
1595 if (do_peeling
1596 && all_misalignments_unknown
1597 && vect_supportable_dr_alignment (dr0, false))
1599 /* Check if the target requires to prefer stores over loads, i.e., if
1600 misaligned stores are more expensive than misaligned loads (taking
1601 drs with same alignment into account). */
1602 if (first_store && DR_IS_READ (dr0))
1604 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1605 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1606 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1607 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1608 stmt_vector_for_cost dummy;
1609 dummy.create (2);
1611 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1612 &dummy);
1613 vect_get_data_access_cost (first_store, &store_inside_cost,
1614 &store_outside_cost, &dummy);
1616 dummy.release ();
1618 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1619 aligning the load DR0). */
1620 load_inside_penalty = store_inside_cost;
1621 load_outside_penalty = store_outside_cost;
1622 for (i = 0;
1623 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1624 DR_STMT (first_store))).iterate (i, &dr);
1625 i++)
1626 if (DR_IS_READ (dr))
1628 load_inside_penalty += load_inside_cost;
1629 load_outside_penalty += load_outside_cost;
1631 else
1633 load_inside_penalty += store_inside_cost;
1634 load_outside_penalty += store_outside_cost;
1637 /* Calculate the penalty for leaving DR0 unaligned (by
1638 aligning the FIRST_STORE). */
1639 store_inside_penalty = load_inside_cost;
1640 store_outside_penalty = load_outside_cost;
1641 for (i = 0;
1642 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1643 DR_STMT (dr0))).iterate (i, &dr);
1644 i++)
1645 if (DR_IS_READ (dr))
1647 store_inside_penalty += load_inside_cost;
1648 store_outside_penalty += load_outside_cost;
1650 else
1652 store_inside_penalty += store_inside_cost;
1653 store_outside_penalty += store_outside_cost;
1656 if (load_inside_penalty > store_inside_penalty
1657 || (load_inside_penalty == store_inside_penalty
1658 && load_outside_penalty > store_outside_penalty))
1659 dr0 = first_store;
1662 /* In case there are only loads with different unknown misalignments, use
1663 peeling only if it may help to align other accesses in the loop or
1664 if it may help improving load bandwith when we'd end up using
1665 unaligned loads. */
1666 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1667 if (!first_store
1668 && !STMT_VINFO_SAME_ALIGN_REFS (
1669 vinfo_for_stmt (DR_STMT (dr0))).length ()
1670 && (vect_supportable_dr_alignment (dr0, false)
1671 != dr_unaligned_supported
1672 || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1673 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1674 do_peeling = false;
1677 if (do_peeling && !dr0)
1679 /* Peeling is possible, but there is no data access that is not supported
1680 unless aligned. So we try to choose the best possible peeling. */
1682 /* We should get here only if there are drs with known misalignment. */
1683 gcc_assert (!all_misalignments_unknown);
1685 /* Choose the best peeling from the hash table. */
1686 dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
1687 loop_vinfo, &npeel,
1688 &body_cost_vec);
1689 if (!dr0 || !npeel)
1690 do_peeling = false;
1693 if (do_peeling)
1695 stmt = DR_STMT (dr0);
1696 stmt_info = vinfo_for_stmt (stmt);
1697 vectype = STMT_VINFO_VECTYPE (stmt_info);
1698 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1700 if (known_alignment_for_access_p (dr0))
1702 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1703 size_zero_node) < 0;
1704 if (!npeel)
1706 /* Since it's known at compile time, compute the number of
1707 iterations in the peeled loop (the peeling factor) for use in
1708 updating DR_MISALIGNMENT values. The peeling factor is the
1709 vectorization factor minus the misalignment as an element
1710 count. */
1711 mis = DR_MISALIGNMENT (dr0);
1712 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1713 npeel = ((negative ? mis - nelements : nelements - mis)
1714 & (nelements - 1));
1717 /* For interleaved data access every iteration accesses all the
1718 members of the group, therefore we divide the number of iterations
1719 by the group size. */
1720 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1721 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1722 npeel /= GROUP_SIZE (stmt_info);
1724 if (dump_enabled_p ())
1725 dump_printf_loc (MSG_NOTE, vect_location,
1726 "Try peeling by %d\n", npeel);
1729 /* Ensure that all data refs can be vectorized after the peel. */
1730 FOR_EACH_VEC_ELT (datarefs, i, dr)
1732 int save_misalignment;
1734 if (dr == dr0)
1735 continue;
1737 stmt = DR_STMT (dr);
1738 stmt_info = vinfo_for_stmt (stmt);
1739 /* For interleaving, only the alignment of the first access
1740 matters. */
1741 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1742 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1743 continue;
1745 /* Strided accesses perform only component accesses, alignment is
1746 irrelevant for them. */
1747 if (STMT_VINFO_STRIDED_P (stmt_info)
1748 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1749 continue;
1751 save_misalignment = DR_MISALIGNMENT (dr);
1752 vect_update_misalignment_for_peel (dr, dr0, npeel);
1753 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1754 SET_DR_MISALIGNMENT (dr, save_misalignment);
1756 if (!supportable_dr_alignment)
1758 do_peeling = false;
1759 break;
1763 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1765 stat = vect_verify_datarefs_alignment (loop_vinfo);
1766 if (!stat)
1767 do_peeling = false;
1768 else
1770 body_cost_vec.release ();
1771 return stat;
1775 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1776 if (do_peeling)
1778 unsigned max_allowed_peel
1779 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1780 if (max_allowed_peel != (unsigned)-1)
1782 unsigned max_peel = npeel;
1783 if (max_peel == 0)
1785 gimple *dr_stmt = DR_STMT (dr0);
1786 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1787 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1788 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1790 if (max_peel > max_allowed_peel)
1792 do_peeling = false;
1793 if (dump_enabled_p ())
1794 dump_printf_loc (MSG_NOTE, vect_location,
1795 "Disable peeling, max peels reached: %d\n", max_peel);
1800 /* Cost model #2 - if peeling may result in a remaining loop not
1801 iterating enough to be vectorized then do not peel. */
1802 if (do_peeling
1803 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1805 unsigned max_peel
1806 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1807 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1808 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1809 do_peeling = false;
1812 if (do_peeling)
1814 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1815 If the misalignment of DR_i is identical to that of dr0 then set
1816 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1817 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1818 by the peeling factor times the element size of DR_i (MOD the
1819 vectorization factor times the size). Otherwise, the
1820 misalignment of DR_i must be set to unknown. */
1821 FOR_EACH_VEC_ELT (datarefs, i, dr)
1822 if (dr != dr0)
1823 vect_update_misalignment_for_peel (dr, dr0, npeel);
1825 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1826 if (npeel)
1827 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1828 else
1829 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1830 = DR_MISALIGNMENT (dr0);
1831 SET_DR_MISALIGNMENT (dr0, 0);
1832 if (dump_enabled_p ())
1834 dump_printf_loc (MSG_NOTE, vect_location,
1835 "Alignment of access forced using peeling.\n");
1836 dump_printf_loc (MSG_NOTE, vect_location,
1837 "Peeling for alignment will be applied.\n");
1839 /* The inside-loop cost will be accounted for in vectorizable_load
1840 and vectorizable_store correctly with adjusted alignments.
1841 Drop the body_cst_vec on the floor here. */
1842 body_cost_vec.release ();
1844 stat = vect_verify_datarefs_alignment (loop_vinfo);
1845 gcc_assert (stat);
1846 return stat;
1850 body_cost_vec.release ();
1852 /* (2) Versioning to force alignment. */
1854 /* Try versioning if:
1855 1) optimize loop for speed
1856 2) there is at least one unsupported misaligned data ref with an unknown
1857 misalignment, and
1858 3) all misaligned data refs with a known misalignment are supported, and
1859 4) the number of runtime alignment checks is within reason. */
1861 do_versioning =
1862 optimize_loop_nest_for_speed_p (loop)
1863 && (!loop->inner); /* FORNOW */
1865 if (do_versioning)
1867 FOR_EACH_VEC_ELT (datarefs, i, dr)
1869 stmt = DR_STMT (dr);
1870 stmt_info = vinfo_for_stmt (stmt);
1872 /* For interleaving, only the alignment of the first access
1873 matters. */
1874 if (aligned_access_p (dr)
1875 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1876 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1877 continue;
1879 if (STMT_VINFO_STRIDED_P (stmt_info))
1881 /* Strided loads perform only component accesses, alignment is
1882 irrelevant for them. */
1883 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1884 continue;
1885 do_versioning = false;
1886 break;
1889 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1891 if (!supportable_dr_alignment)
1893 gimple *stmt;
1894 int mask;
1895 tree vectype;
1897 if (known_alignment_for_access_p (dr)
1898 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1899 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1901 do_versioning = false;
1902 break;
1905 stmt = DR_STMT (dr);
1906 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1907 gcc_assert (vectype);
1909 /* The rightmost bits of an aligned address must be zeros.
1910 Construct the mask needed for this test. For example,
1911 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1912 mask must be 15 = 0xf. */
1913 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1915 /* FORNOW: use the same mask to test all potentially unaligned
1916 references in the loop. The vectorizer currently supports
1917 a single vector size, see the reference to
1918 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1919 vectorization factor is computed. */
1920 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1921 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1922 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1923 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1924 DR_STMT (dr));
1928 /* Versioning requires at least one misaligned data reference. */
1929 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1930 do_versioning = false;
1931 else if (!do_versioning)
1932 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1935 if (do_versioning)
1937 vec<gimple *> may_misalign_stmts
1938 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1939 gimple *stmt;
1941 /* It can now be assumed that the data references in the statements
1942 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1943 of the loop being vectorized. */
1944 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1946 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1947 dr = STMT_VINFO_DATA_REF (stmt_info);
1948 SET_DR_MISALIGNMENT (dr, 0);
1949 if (dump_enabled_p ())
1950 dump_printf_loc (MSG_NOTE, vect_location,
1951 "Alignment of access forced using versioning.\n");
1954 if (dump_enabled_p ())
1955 dump_printf_loc (MSG_NOTE, vect_location,
1956 "Versioning for alignment will be applied.\n");
1958 /* Peeling and versioning can't be done together at this time. */
1959 gcc_assert (! (do_peeling && do_versioning));
1961 stat = vect_verify_datarefs_alignment (loop_vinfo);
1962 gcc_assert (stat);
1963 return stat;
1966 /* This point is reached if neither peeling nor versioning is being done. */
1967 gcc_assert (! (do_peeling || do_versioning));
1969 stat = vect_verify_datarefs_alignment (loop_vinfo);
1970 return stat;
1974 /* Function vect_find_same_alignment_drs.
1976 Update group and alignment relations according to the chosen
1977 vectorization factor. */
1979 static void
1980 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1981 loop_vec_info loop_vinfo)
1983 unsigned int i;
1984 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1985 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1986 struct data_reference *dra = DDR_A (ddr);
1987 struct data_reference *drb = DDR_B (ddr);
1988 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1989 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1990 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1991 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1992 lambda_vector dist_v;
1993 unsigned int loop_depth;
1995 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1996 return;
1998 if (dra == drb)
1999 return;
2001 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
2002 return;
2004 /* Loop-based vectorization and known data dependence. */
2005 if (DDR_NUM_DIST_VECTS (ddr) == 0)
2006 return;
2008 /* Data-dependence analysis reports a distance vector of zero
2009 for data-references that overlap only in the first iteration
2010 but have different sign step (see PR45764).
2011 So as a sanity check require equal DR_STEP. */
2012 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2013 return;
2015 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
2016 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
2018 int dist = dist_v[loop_depth];
2020 if (dump_enabled_p ())
2021 dump_printf_loc (MSG_NOTE, vect_location,
2022 "dependence distance = %d.\n", dist);
2024 /* Same loop iteration. */
2025 if (dist == 0
2026 || (dist % vectorization_factor == 0 && dra_size == drb_size))
2028 /* Two references with distance zero have the same alignment. */
2029 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2030 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2031 if (dump_enabled_p ())
2033 dump_printf_loc (MSG_NOTE, vect_location,
2034 "accesses have the same alignment.\n");
2035 dump_printf (MSG_NOTE,
2036 "dependence distance modulo vf == 0 between ");
2037 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2038 dump_printf (MSG_NOTE, " and ");
2039 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2040 dump_printf (MSG_NOTE, "\n");
2047 /* Function vect_analyze_data_refs_alignment
2049 Analyze the alignment of the data-references in the loop.
2050 Return FALSE if a data reference is found that cannot be vectorized. */
2052 bool
2053 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2055 if (dump_enabled_p ())
2056 dump_printf_loc (MSG_NOTE, vect_location,
2057 "=== vect_analyze_data_refs_alignment ===\n");
2059 /* Mark groups of data references with same alignment using
2060 data dependence information. */
2061 vec<ddr_p> ddrs = vinfo->ddrs;
2062 struct data_dependence_relation *ddr;
2063 unsigned int i;
2065 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2066 vect_find_same_alignment_drs (ddr, vinfo);
2068 vec<data_reference_p> datarefs = vinfo->datarefs;
2069 struct data_reference *dr;
2071 FOR_EACH_VEC_ELT (datarefs, i, dr)
2073 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2074 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2075 && !vect_compute_data_ref_alignment (dr))
2077 /* Strided accesses perform only component accesses, misalignment
2078 information is irrelevant for them. */
2079 if (STMT_VINFO_STRIDED_P (stmt_info)
2080 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2081 continue;
2083 if (dump_enabled_p ())
2084 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2085 "not vectorized: can't calculate alignment "
2086 "for data ref.\n");
2088 return false;
2092 return true;
2096 /* Analyze alignment of DRs of stmts in NODE. */
2098 static bool
2099 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2101 /* We vectorize from the first scalar stmt in the node unless
2102 the node is permuted in which case we start from the first
2103 element in the group. */
2104 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2105 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2106 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2108 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2109 if (! vect_compute_data_ref_alignment (dr)
2110 || ! verify_data_ref_alignment (dr))
2112 if (dump_enabled_p ())
2113 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2114 "not vectorized: bad data alignment in basic "
2115 "block.\n");
2116 return false;
2119 return true;
2122 /* Function vect_slp_analyze_instance_alignment
2124 Analyze the alignment of the data-references in the SLP instance.
2125 Return FALSE if a data reference is found that cannot be vectorized. */
2127 bool
2128 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2130 if (dump_enabled_p ())
2131 dump_printf_loc (MSG_NOTE, vect_location,
2132 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2134 slp_tree node;
2135 unsigned i;
2136 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2137 if (! vect_slp_analyze_and_verify_node_alignment (node))
2138 return false;
2140 node = SLP_INSTANCE_TREE (instance);
2141 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2142 && ! vect_slp_analyze_and_verify_node_alignment
2143 (SLP_INSTANCE_TREE (instance)))
2144 return false;
2146 return true;
2150 /* Analyze groups of accesses: check that DR belongs to a group of
2151 accesses of legal size, step, etc. Detect gaps, single element
2152 interleaving, and other special cases. Set grouped access info.
2153 Collect groups of strided stores for further use in SLP analysis.
2154 Worker for vect_analyze_group_access. */
2156 static bool
2157 vect_analyze_group_access_1 (struct data_reference *dr)
2159 tree step = DR_STEP (dr);
2160 tree scalar_type = TREE_TYPE (DR_REF (dr));
2161 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2162 gimple *stmt = DR_STMT (dr);
2163 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2164 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2165 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2166 HOST_WIDE_INT dr_step = -1;
2167 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2168 bool slp_impossible = false;
2169 struct loop *loop = NULL;
2171 if (loop_vinfo)
2172 loop = LOOP_VINFO_LOOP (loop_vinfo);
2174 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2175 size of the interleaving group (including gaps). */
2176 if (tree_fits_shwi_p (step))
2178 dr_step = tree_to_shwi (step);
2179 /* Check that STEP is a multiple of type size. Otherwise there is
2180 a non-element-sized gap at the end of the group which we
2181 cannot represent in GROUP_GAP or GROUP_SIZE.
2182 ??? As we can handle non-constant step fine here we should
2183 simply remove uses of GROUP_GAP between the last and first
2184 element and instead rely on DR_STEP. GROUP_SIZE then would
2185 simply not include that gap. */
2186 if ((dr_step % type_size) != 0)
2188 if (dump_enabled_p ())
2190 dump_printf_loc (MSG_NOTE, vect_location,
2191 "Step ");
2192 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2193 dump_printf (MSG_NOTE,
2194 " is not a multiple of the element size for ");
2195 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2196 dump_printf (MSG_NOTE, "\n");
2198 return false;
2200 groupsize = absu_hwi (dr_step) / type_size;
2202 else
2203 groupsize = 0;
2205 /* Not consecutive access is possible only if it is a part of interleaving. */
2206 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2208 /* Check if it this DR is a part of interleaving, and is a single
2209 element of the group that is accessed in the loop. */
2211 /* Gaps are supported only for loads. STEP must be a multiple of the type
2212 size. The size of the group must be a power of 2. */
2213 if (DR_IS_READ (dr)
2214 && (dr_step % type_size) == 0
2215 && groupsize > 0
2216 && exact_log2 (groupsize) != -1)
2218 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2219 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2220 if (dump_enabled_p ())
2222 dump_printf_loc (MSG_NOTE, vect_location,
2223 "Detected single element interleaving ");
2224 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2225 dump_printf (MSG_NOTE, " step ");
2226 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2227 dump_printf (MSG_NOTE, "\n");
2230 if (loop_vinfo)
2232 if (dump_enabled_p ())
2233 dump_printf_loc (MSG_NOTE, vect_location,
2234 "Data access with gaps requires scalar "
2235 "epilogue loop\n");
2236 if (loop->inner)
2238 if (dump_enabled_p ())
2239 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2240 "Peeling for outer loop is not"
2241 " supported\n");
2242 return false;
2245 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2248 return true;
2251 if (dump_enabled_p ())
2253 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2254 "not consecutive access ");
2255 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2258 if (bb_vinfo)
2260 /* Mark the statement as unvectorizable. */
2261 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2262 return true;
2265 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2266 STMT_VINFO_STRIDED_P (stmt_info) = true;
2267 return true;
2270 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2272 /* First stmt in the interleaving chain. Check the chain. */
2273 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2274 struct data_reference *data_ref = dr;
2275 unsigned int count = 1;
2276 tree prev_init = DR_INIT (data_ref);
2277 gimple *prev = stmt;
2278 HOST_WIDE_INT diff, gaps = 0;
2280 while (next)
2282 /* Skip same data-refs. In case that two or more stmts share
2283 data-ref (supported only for loads), we vectorize only the first
2284 stmt, and the rest get their vectorized loads from the first
2285 one. */
2286 if (!tree_int_cst_compare (DR_INIT (data_ref),
2287 DR_INIT (STMT_VINFO_DATA_REF (
2288 vinfo_for_stmt (next)))))
2290 if (DR_IS_WRITE (data_ref))
2292 if (dump_enabled_p ())
2293 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2294 "Two store stmts share the same dr.\n");
2295 return false;
2298 if (dump_enabled_p ())
2299 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2300 "Two or more load stmts share the same dr.\n");
2302 /* For load use the same data-ref load. */
2303 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2305 prev = next;
2306 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2307 continue;
2310 prev = next;
2311 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2313 /* All group members have the same STEP by construction. */
2314 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2316 /* Check that the distance between two accesses is equal to the type
2317 size. Otherwise, we have gaps. */
2318 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2319 - TREE_INT_CST_LOW (prev_init)) / type_size;
2320 if (diff != 1)
2322 /* FORNOW: SLP of accesses with gaps is not supported. */
2323 slp_impossible = true;
2324 if (DR_IS_WRITE (data_ref))
2326 if (dump_enabled_p ())
2327 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2328 "interleaved store with gaps\n");
2329 return false;
2332 gaps += diff - 1;
2335 last_accessed_element += diff;
2337 /* Store the gap from the previous member of the group. If there is no
2338 gap in the access, GROUP_GAP is always 1. */
2339 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2341 prev_init = DR_INIT (data_ref);
2342 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2343 /* Count the number of data-refs in the chain. */
2344 count++;
2347 if (groupsize == 0)
2348 groupsize = count + gaps;
2350 if (groupsize > UINT_MAX)
2352 if (dump_enabled_p ())
2353 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2354 "group is too large\n");
2355 return false;
2358 /* Check that the size of the interleaving is equal to count for stores,
2359 i.e., that there are no gaps. */
2360 if (groupsize != count
2361 && !DR_IS_READ (dr))
2363 if (dump_enabled_p ())
2364 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2365 "interleaved store with gaps\n");
2366 return false;
2369 /* If there is a gap after the last load in the group it is the
2370 difference between the groupsize and the last accessed
2371 element.
2372 When there is no gap, this difference should be 0. */
2373 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2375 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2376 if (dump_enabled_p ())
2378 dump_printf_loc (MSG_NOTE, vect_location,
2379 "Detected interleaving ");
2380 if (DR_IS_READ (dr))
2381 dump_printf (MSG_NOTE, "load ");
2382 else
2383 dump_printf (MSG_NOTE, "store ");
2384 dump_printf (MSG_NOTE, "of size %u starting with ",
2385 (unsigned)groupsize);
2386 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2387 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2388 dump_printf_loc (MSG_NOTE, vect_location,
2389 "There is a gap of %u elements after the group\n",
2390 GROUP_GAP (vinfo_for_stmt (stmt)));
2393 /* SLP: create an SLP data structure for every interleaving group of
2394 stores for further analysis in vect_analyse_slp. */
2395 if (DR_IS_WRITE (dr) && !slp_impossible)
2397 if (loop_vinfo)
2398 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2399 if (bb_vinfo)
2400 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2403 /* If there is a gap in the end of the group or the group size cannot
2404 be made a multiple of the vector element count then we access excess
2405 elements in the last iteration and thus need to peel that off. */
2406 if (loop_vinfo
2407 && (groupsize - last_accessed_element > 0
2408 || exact_log2 (groupsize) == -1))
2411 if (dump_enabled_p ())
2412 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2413 "Data access with gaps requires scalar "
2414 "epilogue loop\n");
2415 if (loop->inner)
2417 if (dump_enabled_p ())
2418 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2419 "Peeling for outer loop is not supported\n");
2420 return false;
2423 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2427 return true;
2430 /* Analyze groups of accesses: check that DR belongs to a group of
2431 accesses of legal size, step, etc. Detect gaps, single element
2432 interleaving, and other special cases. Set grouped access info.
2433 Collect groups of strided stores for further use in SLP analysis. */
2435 static bool
2436 vect_analyze_group_access (struct data_reference *dr)
2438 if (!vect_analyze_group_access_1 (dr))
2440 /* Dissolve the group if present. */
2441 gimple *next;
2442 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2443 while (stmt)
2445 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2446 next = GROUP_NEXT_ELEMENT (vinfo);
2447 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2448 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2449 stmt = next;
2451 return false;
2453 return true;
2456 /* Analyze the access pattern of the data-reference DR.
2457 In case of non-consecutive accesses call vect_analyze_group_access() to
2458 analyze groups of accesses. */
2460 static bool
2461 vect_analyze_data_ref_access (struct data_reference *dr)
2463 tree step = DR_STEP (dr);
2464 tree scalar_type = TREE_TYPE (DR_REF (dr));
2465 gimple *stmt = DR_STMT (dr);
2466 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2467 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2468 struct loop *loop = NULL;
2470 if (loop_vinfo)
2471 loop = LOOP_VINFO_LOOP (loop_vinfo);
2473 if (loop_vinfo && !step)
2475 if (dump_enabled_p ())
2476 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2477 "bad data-ref access in loop\n");
2478 return false;
2481 /* Allow loads with zero step in inner-loop vectorization. */
2482 if (loop_vinfo && integer_zerop (step))
2484 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2485 if (!nested_in_vect_loop_p (loop, stmt))
2486 return DR_IS_READ (dr);
2487 /* Allow references with zero step for outer loops marked
2488 with pragma omp simd only - it guarantees absence of
2489 loop-carried dependencies between inner loop iterations. */
2490 if (!loop->force_vectorize)
2492 if (dump_enabled_p ())
2493 dump_printf_loc (MSG_NOTE, vect_location,
2494 "zero step in inner loop of nest\n");
2495 return false;
2499 if (loop && nested_in_vect_loop_p (loop, stmt))
2501 /* Interleaved accesses are not yet supported within outer-loop
2502 vectorization for references in the inner-loop. */
2503 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2505 /* For the rest of the analysis we use the outer-loop step. */
2506 step = STMT_VINFO_DR_STEP (stmt_info);
2507 if (integer_zerop (step))
2509 if (dump_enabled_p ())
2510 dump_printf_loc (MSG_NOTE, vect_location,
2511 "zero step in outer loop.\n");
2512 return DR_IS_READ (dr);
2516 /* Consecutive? */
2517 if (TREE_CODE (step) == INTEGER_CST)
2519 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2520 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2521 || (dr_step < 0
2522 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2524 /* Mark that it is not interleaving. */
2525 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2526 return true;
2530 if (loop && nested_in_vect_loop_p (loop, stmt))
2532 if (dump_enabled_p ())
2533 dump_printf_loc (MSG_NOTE, vect_location,
2534 "grouped access in outer loop.\n");
2535 return false;
2539 /* Assume this is a DR handled by non-constant strided load case. */
2540 if (TREE_CODE (step) != INTEGER_CST)
2541 return (STMT_VINFO_STRIDED_P (stmt_info)
2542 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2543 || vect_analyze_group_access (dr)));
2545 /* Not consecutive access - check if it's a part of interleaving group. */
2546 return vect_analyze_group_access (dr);
2551 /* A helper function used in the comparator function to sort data
2552 references. T1 and T2 are two data references to be compared.
2553 The function returns -1, 0, or 1. */
2555 static int
2556 compare_tree (tree t1, tree t2)
2558 int i, cmp;
2559 enum tree_code code;
2560 char tclass;
2562 if (t1 == t2)
2563 return 0;
2564 if (t1 == NULL)
2565 return -1;
2566 if (t2 == NULL)
2567 return 1;
2569 STRIP_NOPS (t1);
2570 STRIP_NOPS (t2);
2572 if (TREE_CODE (t1) != TREE_CODE (t2))
2573 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2575 code = TREE_CODE (t1);
2576 switch (code)
2578 /* For const values, we can just use hash values for comparisons. */
2579 case INTEGER_CST:
2580 case REAL_CST:
2581 case FIXED_CST:
2582 case STRING_CST:
2583 case COMPLEX_CST:
2584 case VECTOR_CST:
2586 hashval_t h1 = iterative_hash_expr (t1, 0);
2587 hashval_t h2 = iterative_hash_expr (t2, 0);
2588 if (h1 != h2)
2589 return h1 < h2 ? -1 : 1;
2590 break;
2593 case SSA_NAME:
2594 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2595 if (cmp != 0)
2596 return cmp;
2598 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2599 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2600 break;
2602 default:
2603 tclass = TREE_CODE_CLASS (code);
2605 /* For var-decl, we could compare their UIDs. */
2606 if (tclass == tcc_declaration)
2608 if (DECL_UID (t1) != DECL_UID (t2))
2609 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2610 break;
2613 /* For expressions with operands, compare their operands recursively. */
2614 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2616 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2617 if (cmp != 0)
2618 return cmp;
2622 return 0;
2626 /* Compare two data-references DRA and DRB to group them into chunks
2627 suitable for grouping. */
2629 static int
2630 dr_group_sort_cmp (const void *dra_, const void *drb_)
2632 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2633 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2634 int cmp;
2636 /* Stabilize sort. */
2637 if (dra == drb)
2638 return 0;
2640 /* Ordering of DRs according to base. */
2641 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2643 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2644 if (cmp != 0)
2645 return cmp;
2648 /* And according to DR_OFFSET. */
2649 if (!dr_equal_offsets_p (dra, drb))
2651 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2652 if (cmp != 0)
2653 return cmp;
2656 /* Put reads before writes. */
2657 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2658 return DR_IS_READ (dra) ? -1 : 1;
2660 /* Then sort after access size. */
2661 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2662 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2664 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2665 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2666 if (cmp != 0)
2667 return cmp;
2670 /* And after step. */
2671 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2673 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2674 if (cmp != 0)
2675 return cmp;
2678 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2679 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2680 if (cmp == 0)
2681 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2682 return cmp;
2685 /* Function vect_analyze_data_ref_accesses.
2687 Analyze the access pattern of all the data references in the loop.
2689 FORNOW: the only access pattern that is considered vectorizable is a
2690 simple step 1 (consecutive) access.
2692 FORNOW: handle only arrays and pointer accesses. */
2694 bool
2695 vect_analyze_data_ref_accesses (vec_info *vinfo)
2697 unsigned int i;
2698 vec<data_reference_p> datarefs = vinfo->datarefs;
2699 struct data_reference *dr;
2701 if (dump_enabled_p ())
2702 dump_printf_loc (MSG_NOTE, vect_location,
2703 "=== vect_analyze_data_ref_accesses ===\n");
2705 if (datarefs.is_empty ())
2706 return true;
2708 /* Sort the array of datarefs to make building the interleaving chains
2709 linear. Don't modify the original vector's order, it is needed for
2710 determining what dependencies are reversed. */
2711 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2712 datarefs_copy.qsort (dr_group_sort_cmp);
2714 /* Build the interleaving chains. */
2715 for (i = 0; i < datarefs_copy.length () - 1;)
2717 data_reference_p dra = datarefs_copy[i];
2718 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2719 stmt_vec_info lastinfo = NULL;
2720 for (i = i + 1; i < datarefs_copy.length (); ++i)
2722 data_reference_p drb = datarefs_copy[i];
2723 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2725 /* ??? Imperfect sorting (non-compatible types, non-modulo
2726 accesses, same accesses) can lead to a group to be artificially
2727 split here as we don't just skip over those. If it really
2728 matters we can push those to a worklist and re-iterate
2729 over them. The we can just skip ahead to the next DR here. */
2731 /* Check that the data-refs have same first location (except init)
2732 and they are both either store or load (not load and store,
2733 not masked loads or stores). */
2734 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2735 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2736 DR_BASE_ADDRESS (drb), 0)
2737 || !dr_equal_offsets_p (dra, drb)
2738 || !gimple_assign_single_p (DR_STMT (dra))
2739 || !gimple_assign_single_p (DR_STMT (drb)))
2740 break;
2742 /* Check that the data-refs have the same constant size. */
2743 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2744 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2745 if (!tree_fits_uhwi_p (sza)
2746 || !tree_fits_uhwi_p (szb)
2747 || !tree_int_cst_equal (sza, szb))
2748 break;
2750 /* Check that the data-refs have the same step. */
2751 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2752 break;
2754 /* Do not place the same access in the interleaving chain twice. */
2755 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2756 break;
2758 /* Check the types are compatible.
2759 ??? We don't distinguish this during sorting. */
2760 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2761 TREE_TYPE (DR_REF (drb))))
2762 break;
2764 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2765 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2766 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2767 gcc_assert (init_a < init_b);
2769 /* If init_b == init_a + the size of the type * k, we have an
2770 interleaving, and DRA is accessed before DRB. */
2771 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2772 if (type_size_a == 0
2773 || (init_b - init_a) % type_size_a != 0)
2774 break;
2776 /* If we have a store, the accesses are adjacent. This splits
2777 groups into chunks we support (we don't support vectorization
2778 of stores with gaps). */
2779 if (!DR_IS_READ (dra)
2780 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2781 (DR_INIT (datarefs_copy[i-1]))
2782 != type_size_a))
2783 break;
2785 /* If the step (if not zero or non-constant) is greater than the
2786 difference between data-refs' inits this splits groups into
2787 suitable sizes. */
2788 if (tree_fits_shwi_p (DR_STEP (dra)))
2790 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2791 if (step != 0 && step <= (init_b - init_a))
2792 break;
2795 if (dump_enabled_p ())
2797 dump_printf_loc (MSG_NOTE, vect_location,
2798 "Detected interleaving ");
2799 if (DR_IS_READ (dra))
2800 dump_printf (MSG_NOTE, "load ");
2801 else
2802 dump_printf (MSG_NOTE, "store ");
2803 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2804 dump_printf (MSG_NOTE, " and ");
2805 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2806 dump_printf (MSG_NOTE, "\n");
2809 /* Link the found element into the group list. */
2810 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2812 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2813 lastinfo = stmtinfo_a;
2815 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2816 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2817 lastinfo = stmtinfo_b;
2821 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2822 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2823 && !vect_analyze_data_ref_access (dr))
2825 if (dump_enabled_p ())
2826 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2827 "not vectorized: complicated access pattern.\n");
2829 if (is_a <bb_vec_info> (vinfo))
2831 /* Mark the statement as not vectorizable. */
2832 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2833 continue;
2835 else
2837 datarefs_copy.release ();
2838 return false;
2842 datarefs_copy.release ();
2843 return true;
2847 /* Operator == between two dr_with_seg_len objects.
2849 This equality operator is used to make sure two data refs
2850 are the same one so that we will consider to combine the
2851 aliasing checks of those two pairs of data dependent data
2852 refs. */
2854 static bool
2855 operator == (const dr_with_seg_len& d1,
2856 const dr_with_seg_len& d2)
2858 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2859 DR_BASE_ADDRESS (d2.dr), 0)
2860 && compare_tree (d1.offset, d2.offset) == 0
2861 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2864 /* Function comp_dr_with_seg_len_pair.
2866 Comparison function for sorting objects of dr_with_seg_len_pair_t
2867 so that we can combine aliasing checks in one scan. */
2869 static int
2870 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2872 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2873 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2875 const dr_with_seg_len &p11 = p1->first,
2876 &p12 = p1->second,
2877 &p21 = p2->first,
2878 &p22 = p2->second;
2880 /* For DR pairs (a, b) and (c, d), we only consider to merge the alias checks
2881 if a and c have the same basic address snd step, and b and d have the same
2882 address and step. Therefore, if any a&c or b&d don't have the same address
2883 and step, we don't care the order of those two pairs after sorting. */
2884 int comp_res;
2886 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2887 DR_BASE_ADDRESS (p21.dr))) != 0)
2888 return comp_res;
2889 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2890 DR_BASE_ADDRESS (p22.dr))) != 0)
2891 return comp_res;
2892 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2893 return comp_res;
2894 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2895 return comp_res;
2896 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2897 return comp_res;
2898 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2899 return comp_res;
2901 return 0;
2904 /* Function vect_vfa_segment_size.
2906 Create an expression that computes the size of segment
2907 that will be accessed for a data reference. The functions takes into
2908 account that realignment loads may access one more vector.
2910 Input:
2911 DR: The data reference.
2912 LENGTH_FACTOR: segment length to consider.
2914 Return an expression whose value is the size of segment which will be
2915 accessed by DR. */
2917 static tree
2918 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2920 tree segment_length;
2922 if (integer_zerop (DR_STEP (dr)))
2923 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2924 else
2925 segment_length = size_binop (MULT_EXPR,
2926 fold_convert (sizetype, DR_STEP (dr)),
2927 fold_convert (sizetype, length_factor));
2929 if (vect_supportable_dr_alignment (dr, false)
2930 == dr_explicit_realign_optimized)
2932 tree vector_size = TYPE_SIZE_UNIT
2933 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2935 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2937 return segment_length;
2940 /* Function vect_prune_runtime_alias_test_list.
2942 Prune a list of ddrs to be tested at run-time by versioning for alias.
2943 Merge several alias checks into one if possible.
2944 Return FALSE if resulting list of ddrs is longer then allowed by
2945 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2947 bool
2948 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2950 vec<ddr_p> may_alias_ddrs =
2951 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2952 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2953 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2954 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2955 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2957 ddr_p ddr;
2958 unsigned int i;
2959 tree length_factor;
2961 if (dump_enabled_p ())
2962 dump_printf_loc (MSG_NOTE, vect_location,
2963 "=== vect_prune_runtime_alias_test_list ===\n");
2965 if (may_alias_ddrs.is_empty ())
2966 return true;
2968 /* Basically, for each pair of dependent data refs store_ptr_0
2969 and load_ptr_0, we create an expression:
2971 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2972 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2974 for aliasing checks. However, in some cases we can decrease
2975 the number of checks by combining two checks into one. For
2976 example, suppose we have another pair of data refs store_ptr_0
2977 and load_ptr_1, and if the following condition is satisfied:
2979 load_ptr_0 < load_ptr_1 &&
2980 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2982 (this condition means, in each iteration of vectorized loop,
2983 the accessed memory of store_ptr_0 cannot be between the memory
2984 of load_ptr_0 and load_ptr_1.)
2986 we then can use only the following expression to finish the
2987 alising checks between store_ptr_0 & load_ptr_0 and
2988 store_ptr_0 & load_ptr_1:
2990 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2991 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2993 Note that we only consider that load_ptr_0 and load_ptr_1 have the
2994 same basic address. */
2996 comp_alias_ddrs.create (may_alias_ddrs.length ());
2998 /* First, we collect all data ref pairs for aliasing checks. */
2999 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3001 struct data_reference *dr_a, *dr_b;
3002 gimple *dr_group_first_a, *dr_group_first_b;
3003 tree segment_length_a, segment_length_b;
3004 gimple *stmt_a, *stmt_b;
3006 dr_a = DDR_A (ddr);
3007 stmt_a = DR_STMT (DDR_A (ddr));
3008 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3009 if (dr_group_first_a)
3011 stmt_a = dr_group_first_a;
3012 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3015 dr_b = DDR_B (ddr);
3016 stmt_b = DR_STMT (DDR_B (ddr));
3017 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3018 if (dr_group_first_b)
3020 stmt_b = dr_group_first_b;
3021 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3024 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3025 length_factor = scalar_loop_iters;
3026 else
3027 length_factor = size_int (vect_factor);
3028 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3029 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3031 dr_with_seg_len_pair_t dr_with_seg_len_pair
3032 (dr_with_seg_len (dr_a, segment_length_a),
3033 dr_with_seg_len (dr_b, segment_length_b));
3035 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
3036 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3038 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3041 /* Second, we sort the collected data ref pairs so that we can scan
3042 them once to combine all possible aliasing checks. */
3043 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
3045 /* Third, we scan the sorted dr pairs and check if we can combine
3046 alias checks of two neighbouring dr pairs. */
3047 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
3049 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
3050 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
3051 *dr_b1 = &comp_alias_ddrs[i-1].second,
3052 *dr_a2 = &comp_alias_ddrs[i].first,
3053 *dr_b2 = &comp_alias_ddrs[i].second;
3055 /* Remove duplicate data ref pairs. */
3056 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
3058 if (dump_enabled_p ())
3060 dump_printf_loc (MSG_NOTE, vect_location,
3061 "found equal ranges ");
3062 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3063 DR_REF (dr_a1->dr));
3064 dump_printf (MSG_NOTE, ", ");
3065 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3066 DR_REF (dr_b1->dr));
3067 dump_printf (MSG_NOTE, " and ");
3068 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3069 DR_REF (dr_a2->dr));
3070 dump_printf (MSG_NOTE, ", ");
3071 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3072 DR_REF (dr_b2->dr));
3073 dump_printf (MSG_NOTE, "\n");
3076 comp_alias_ddrs.ordered_remove (i--);
3077 continue;
3080 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
3082 /* We consider the case that DR_B1 and DR_B2 are same memrefs,
3083 and DR_A1 and DR_A2 are two consecutive memrefs. */
3084 if (*dr_a1 == *dr_a2)
3086 std::swap (dr_a1, dr_b1);
3087 std::swap (dr_a2, dr_b2);
3090 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
3091 DR_BASE_ADDRESS (dr_a2->dr),
3093 || !tree_fits_shwi_p (dr_a1->offset)
3094 || !tree_fits_shwi_p (dr_a2->offset))
3095 continue;
3097 HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
3098 - tree_to_shwi (dr_a1->offset));
3101 /* Now we check if the following condition is satisfied:
3103 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
3105 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
3106 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
3107 have to make a best estimation. We can get the minimum value
3108 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
3109 then either of the following two conditions can guarantee the
3110 one above:
3112 1: DIFF <= MIN_SEG_LEN_B
3113 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
3117 HOST_WIDE_INT min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
3118 ? tree_to_shwi (dr_b1->seg_len)
3119 : vect_factor);
3121 if (diff <= min_seg_len_b
3122 || (tree_fits_shwi_p (dr_a1->seg_len)
3123 && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
3125 if (dump_enabled_p ())
3127 dump_printf_loc (MSG_NOTE, vect_location,
3128 "merging ranges for ");
3129 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3130 DR_REF (dr_a1->dr));
3131 dump_printf (MSG_NOTE, ", ");
3132 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3133 DR_REF (dr_b1->dr));
3134 dump_printf (MSG_NOTE, " and ");
3135 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3136 DR_REF (dr_a2->dr));
3137 dump_printf (MSG_NOTE, ", ");
3138 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3139 DR_REF (dr_b2->dr));
3140 dump_printf (MSG_NOTE, "\n");
3143 dr_a1->seg_len = size_binop (PLUS_EXPR,
3144 dr_a2->seg_len, size_int (diff));
3145 comp_alias_ddrs.ordered_remove (i--);
3150 dump_printf_loc (MSG_NOTE, vect_location,
3151 "improved number of alias checks from %d to %d\n",
3152 may_alias_ddrs.length (), comp_alias_ddrs.length ());
3153 if ((int) comp_alias_ddrs.length () >
3154 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3155 return false;
3157 return true;
3160 /* Check whether a non-affine read or write in stmt is suitable for gather load
3161 or scatter store and if so, return a builtin decl for that operation. */
3163 tree
3164 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo, tree *basep,
3165 tree *offp, int *scalep)
3167 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3168 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3169 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3170 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3171 tree offtype = NULL_TREE;
3172 tree decl, base, off;
3173 machine_mode pmode;
3174 int punsignedp, reversep, pvolatilep = 0;
3176 base = DR_REF (dr);
3177 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3178 see if we can use the def stmt of the address. */
3179 if (is_gimple_call (stmt)
3180 && gimple_call_internal_p (stmt)
3181 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3182 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3183 && TREE_CODE (base) == MEM_REF
3184 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3185 && integer_zerop (TREE_OPERAND (base, 1))
3186 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3188 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3189 if (is_gimple_assign (def_stmt)
3190 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3191 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3194 /* The gather and scatter builtins need address of the form
3195 loop_invariant + vector * {1, 2, 4, 8}
3197 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3198 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3199 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3200 multiplications and additions in it. To get a vector, we need
3201 a single SSA_NAME that will be defined in the loop and will
3202 contain everything that is not loop invariant and that can be
3203 vectorized. The following code attempts to find such a preexistng
3204 SSA_NAME OFF and put the loop invariants into a tree BASE
3205 that can be gimplified before the loop. */
3206 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3207 &punsignedp, &reversep, &pvolatilep, false);
3208 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3210 if (TREE_CODE (base) == MEM_REF)
3212 if (!integer_zerop (TREE_OPERAND (base, 1)))
3214 if (off == NULL_TREE)
3216 offset_int moff = mem_ref_offset (base);
3217 off = wide_int_to_tree (sizetype, moff);
3219 else
3220 off = size_binop (PLUS_EXPR, off,
3221 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3223 base = TREE_OPERAND (base, 0);
3225 else
3226 base = build_fold_addr_expr (base);
3228 if (off == NULL_TREE)
3229 off = size_zero_node;
3231 /* If base is not loop invariant, either off is 0, then we start with just
3232 the constant offset in the loop invariant BASE and continue with base
3233 as OFF, otherwise give up.
3234 We could handle that case by gimplifying the addition of base + off
3235 into some SSA_NAME and use that as off, but for now punt. */
3236 if (!expr_invariant_in_loop_p (loop, base))
3238 if (!integer_zerop (off))
3239 return NULL_TREE;
3240 off = base;
3241 base = size_int (pbitpos / BITS_PER_UNIT);
3243 /* Otherwise put base + constant offset into the loop invariant BASE
3244 and continue with OFF. */
3245 else
3247 base = fold_convert (sizetype, base);
3248 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3251 /* OFF at this point may be either a SSA_NAME or some tree expression
3252 from get_inner_reference. Try to peel off loop invariants from it
3253 into BASE as long as possible. */
3254 STRIP_NOPS (off);
3255 while (offtype == NULL_TREE)
3257 enum tree_code code;
3258 tree op0, op1, add = NULL_TREE;
3260 if (TREE_CODE (off) == SSA_NAME)
3262 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3264 if (expr_invariant_in_loop_p (loop, off))
3265 return NULL_TREE;
3267 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3268 break;
3270 op0 = gimple_assign_rhs1 (def_stmt);
3271 code = gimple_assign_rhs_code (def_stmt);
3272 op1 = gimple_assign_rhs2 (def_stmt);
3274 else
3276 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3277 return NULL_TREE;
3278 code = TREE_CODE (off);
3279 extract_ops_from_tree (off, &code, &op0, &op1);
3281 switch (code)
3283 case POINTER_PLUS_EXPR:
3284 case PLUS_EXPR:
3285 if (expr_invariant_in_loop_p (loop, op0))
3287 add = op0;
3288 off = op1;
3289 do_add:
3290 add = fold_convert (sizetype, add);
3291 if (scale != 1)
3292 add = size_binop (MULT_EXPR, add, size_int (scale));
3293 base = size_binop (PLUS_EXPR, base, add);
3294 continue;
3296 if (expr_invariant_in_loop_p (loop, op1))
3298 add = op1;
3299 off = op0;
3300 goto do_add;
3302 break;
3303 case MINUS_EXPR:
3304 if (expr_invariant_in_loop_p (loop, op1))
3306 add = fold_convert (sizetype, op1);
3307 add = size_binop (MINUS_EXPR, size_zero_node, add);
3308 off = op0;
3309 goto do_add;
3311 break;
3312 case MULT_EXPR:
3313 if (scale == 1 && tree_fits_shwi_p (op1))
3315 scale = tree_to_shwi (op1);
3316 off = op0;
3317 continue;
3319 break;
3320 case SSA_NAME:
3321 off = op0;
3322 continue;
3323 CASE_CONVERT:
3324 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3325 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3326 break;
3327 if (TYPE_PRECISION (TREE_TYPE (op0))
3328 == TYPE_PRECISION (TREE_TYPE (off)))
3330 off = op0;
3331 continue;
3333 if (TYPE_PRECISION (TREE_TYPE (op0))
3334 < TYPE_PRECISION (TREE_TYPE (off)))
3336 off = op0;
3337 offtype = TREE_TYPE (off);
3338 STRIP_NOPS (off);
3339 continue;
3341 break;
3342 default:
3343 break;
3345 break;
3348 /* If at the end OFF still isn't a SSA_NAME or isn't
3349 defined in the loop, punt. */
3350 if (TREE_CODE (off) != SSA_NAME
3351 || expr_invariant_in_loop_p (loop, off))
3352 return NULL_TREE;
3354 if (offtype == NULL_TREE)
3355 offtype = TREE_TYPE (off);
3357 if (DR_IS_READ (dr))
3358 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3359 offtype, scale);
3360 else
3361 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3362 offtype, scale);
3364 if (decl == NULL_TREE)
3365 return NULL_TREE;
3367 if (basep)
3368 *basep = base;
3369 if (offp)
3370 *offp = off;
3371 if (scalep)
3372 *scalep = scale;
3373 return decl;
3376 /* Function vect_analyze_data_refs.
3378 Find all the data references in the loop or basic block.
3380 The general structure of the analysis of data refs in the vectorizer is as
3381 follows:
3382 1- vect_analyze_data_refs(loop/bb): call
3383 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3384 in the loop/bb and their dependences.
3385 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3386 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3387 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3391 bool
3392 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3394 struct loop *loop = NULL;
3395 unsigned int i;
3396 struct data_reference *dr;
3397 tree scalar_type;
3399 if (dump_enabled_p ())
3400 dump_printf_loc (MSG_NOTE, vect_location,
3401 "=== vect_analyze_data_refs ===\n");
3403 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3404 loop = LOOP_VINFO_LOOP (loop_vinfo);
3406 /* Go through the data-refs, check that the analysis succeeded. Update
3407 pointer from stmt_vec_info struct to DR and vectype. */
3409 vec<data_reference_p> datarefs = vinfo->datarefs;
3410 FOR_EACH_VEC_ELT (datarefs, i, dr)
3412 gimple *stmt;
3413 stmt_vec_info stmt_info;
3414 tree base, offset, init;
3415 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3416 bool simd_lane_access = false;
3417 int vf;
3419 again:
3420 if (!dr || !DR_REF (dr))
3422 if (dump_enabled_p ())
3423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3424 "not vectorized: unhandled data-ref\n");
3425 return false;
3428 stmt = DR_STMT (dr);
3429 stmt_info = vinfo_for_stmt (stmt);
3431 /* Discard clobbers from the dataref vector. We will remove
3432 clobber stmts during vectorization. */
3433 if (gimple_clobber_p (stmt))
3435 free_data_ref (dr);
3436 if (i == datarefs.length () - 1)
3438 datarefs.pop ();
3439 break;
3441 datarefs.ordered_remove (i);
3442 dr = datarefs[i];
3443 goto again;
3446 /* Check that analysis of the data-ref succeeded. */
3447 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3448 || !DR_STEP (dr))
3450 bool maybe_gather
3451 = DR_IS_READ (dr)
3452 && !TREE_THIS_VOLATILE (DR_REF (dr))
3453 && targetm.vectorize.builtin_gather != NULL;
3454 bool maybe_scatter
3455 = DR_IS_WRITE (dr)
3456 && !TREE_THIS_VOLATILE (DR_REF (dr))
3457 && targetm.vectorize.builtin_scatter != NULL;
3458 bool maybe_simd_lane_access
3459 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3461 /* If target supports vector gather loads or scatter stores, or if
3462 this might be a SIMD lane access, see if they can't be used. */
3463 if (is_a <loop_vec_info> (vinfo)
3464 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3465 && !nested_in_vect_loop_p (loop, stmt))
3467 struct data_reference *newdr
3468 = create_data_ref (NULL, loop_containing_stmt (stmt),
3469 DR_REF (dr), stmt, maybe_scatter ? false : true);
3470 gcc_assert (newdr != NULL && DR_REF (newdr));
3471 if (DR_BASE_ADDRESS (newdr)
3472 && DR_OFFSET (newdr)
3473 && DR_INIT (newdr)
3474 && DR_STEP (newdr)
3475 && integer_zerop (DR_STEP (newdr)))
3477 if (maybe_simd_lane_access)
3479 tree off = DR_OFFSET (newdr);
3480 STRIP_NOPS (off);
3481 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3482 && TREE_CODE (off) == MULT_EXPR
3483 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3485 tree step = TREE_OPERAND (off, 1);
3486 off = TREE_OPERAND (off, 0);
3487 STRIP_NOPS (off);
3488 if (CONVERT_EXPR_P (off)
3489 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3490 0)))
3491 < TYPE_PRECISION (TREE_TYPE (off)))
3492 off = TREE_OPERAND (off, 0);
3493 if (TREE_CODE (off) == SSA_NAME)
3495 gimple *def = SSA_NAME_DEF_STMT (off);
3496 tree reft = TREE_TYPE (DR_REF (newdr));
3497 if (is_gimple_call (def)
3498 && gimple_call_internal_p (def)
3499 && (gimple_call_internal_fn (def)
3500 == IFN_GOMP_SIMD_LANE))
3502 tree arg = gimple_call_arg (def, 0);
3503 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3504 arg = SSA_NAME_VAR (arg);
3505 if (arg == loop->simduid
3506 /* For now. */
3507 && tree_int_cst_equal
3508 (TYPE_SIZE_UNIT (reft),
3509 step))
3511 DR_OFFSET (newdr) = ssize_int (0);
3512 DR_STEP (newdr) = step;
3513 DR_ALIGNED_TO (newdr)
3514 = size_int (BIGGEST_ALIGNMENT);
3515 dr = newdr;
3516 simd_lane_access = true;
3522 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3524 dr = newdr;
3525 if (maybe_gather)
3526 gatherscatter = GATHER;
3527 else
3528 gatherscatter = SCATTER;
3531 if (gatherscatter == SG_NONE && !simd_lane_access)
3532 free_data_ref (newdr);
3535 if (gatherscatter == SG_NONE && !simd_lane_access)
3537 if (dump_enabled_p ())
3539 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3540 "not vectorized: data ref analysis "
3541 "failed ");
3542 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3543 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3546 if (is_a <bb_vec_info> (vinfo))
3547 break;
3549 return false;
3553 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3555 if (dump_enabled_p ())
3556 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3557 "not vectorized: base addr of dr is a "
3558 "constant\n");
3560 if (is_a <bb_vec_info> (vinfo))
3561 break;
3563 if (gatherscatter != SG_NONE || simd_lane_access)
3564 free_data_ref (dr);
3565 return false;
3568 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3570 if (dump_enabled_p ())
3572 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3573 "not vectorized: volatile type ");
3574 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3575 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3578 if (is_a <bb_vec_info> (vinfo))
3579 break;
3581 return false;
3584 if (stmt_can_throw_internal (stmt))
3586 if (dump_enabled_p ())
3588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3589 "not vectorized: statement can throw an "
3590 "exception ");
3591 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3592 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3595 if (is_a <bb_vec_info> (vinfo))
3596 break;
3598 if (gatherscatter != SG_NONE || simd_lane_access)
3599 free_data_ref (dr);
3600 return false;
3603 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3604 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3606 if (dump_enabled_p ())
3608 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3609 "not vectorized: statement is bitfield "
3610 "access ");
3611 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3612 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3615 if (is_a <bb_vec_info> (vinfo))
3616 break;
3618 if (gatherscatter != SG_NONE || simd_lane_access)
3619 free_data_ref (dr);
3620 return false;
3623 base = unshare_expr (DR_BASE_ADDRESS (dr));
3624 offset = unshare_expr (DR_OFFSET (dr));
3625 init = unshare_expr (DR_INIT (dr));
3627 if (is_gimple_call (stmt)
3628 && (!gimple_call_internal_p (stmt)
3629 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3630 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3632 if (dump_enabled_p ())
3634 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3635 "not vectorized: dr in a call ");
3636 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3637 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3640 if (is_a <bb_vec_info> (vinfo))
3641 break;
3643 if (gatherscatter != SG_NONE || simd_lane_access)
3644 free_data_ref (dr);
3645 return false;
3648 /* Update DR field in stmt_vec_info struct. */
3650 /* If the dataref is in an inner-loop of the loop that is considered for
3651 for vectorization, we also want to analyze the access relative to
3652 the outer-loop (DR contains information only relative to the
3653 inner-most enclosing loop). We do that by building a reference to the
3654 first location accessed by the inner-loop, and analyze it relative to
3655 the outer-loop. */
3656 if (loop && nested_in_vect_loop_p (loop, stmt))
3658 tree outer_step, outer_base, outer_init;
3659 HOST_WIDE_INT pbitsize, pbitpos;
3660 tree poffset;
3661 machine_mode pmode;
3662 int punsignedp, preversep, pvolatilep;
3663 affine_iv base_iv, offset_iv;
3664 tree dinit;
3666 /* Build a reference to the first location accessed by the
3667 inner-loop: *(BASE+INIT). (The first location is actually
3668 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3669 tree inner_base = build_fold_indirect_ref
3670 (fold_build_pointer_plus (base, init));
3672 if (dump_enabled_p ())
3674 dump_printf_loc (MSG_NOTE, vect_location,
3675 "analyze in outer-loop: ");
3676 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3677 dump_printf (MSG_NOTE, "\n");
3680 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3681 &poffset, &pmode, &punsignedp,
3682 &preversep, &pvolatilep, false);
3683 gcc_assert (outer_base != NULL_TREE);
3685 if (pbitpos % BITS_PER_UNIT != 0)
3687 if (dump_enabled_p ())
3688 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3689 "failed: bit offset alignment.\n");
3690 return false;
3693 if (preversep)
3695 if (dump_enabled_p ())
3696 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3697 "failed: reverse storage order.\n");
3698 return false;
3701 outer_base = build_fold_addr_expr (outer_base);
3702 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3703 &base_iv, false))
3705 if (dump_enabled_p ())
3706 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3707 "failed: evolution of base is not affine.\n");
3708 return false;
3711 if (offset)
3713 if (poffset)
3714 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3715 poffset);
3716 else
3717 poffset = offset;
3720 if (!poffset)
3722 offset_iv.base = ssize_int (0);
3723 offset_iv.step = ssize_int (0);
3725 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3726 &offset_iv, false))
3728 if (dump_enabled_p ())
3729 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3730 "evolution of offset is not affine.\n");
3731 return false;
3734 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3735 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3736 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3737 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3738 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3740 outer_step = size_binop (PLUS_EXPR,
3741 fold_convert (ssizetype, base_iv.step),
3742 fold_convert (ssizetype, offset_iv.step));
3744 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3745 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3746 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3747 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3748 STMT_VINFO_DR_OFFSET (stmt_info) =
3749 fold_convert (ssizetype, offset_iv.base);
3750 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3751 size_int (highest_pow2_factor (offset_iv.base));
3753 if (dump_enabled_p ())
3755 dump_printf_loc (MSG_NOTE, vect_location,
3756 "\touter base_address: ");
3757 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3758 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3759 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3760 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3761 STMT_VINFO_DR_OFFSET (stmt_info));
3762 dump_printf (MSG_NOTE,
3763 "\n\touter constant offset from base address: ");
3764 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3765 STMT_VINFO_DR_INIT (stmt_info));
3766 dump_printf (MSG_NOTE, "\n\touter step: ");
3767 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3768 STMT_VINFO_DR_STEP (stmt_info));
3769 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3770 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3771 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3772 dump_printf (MSG_NOTE, "\n");
3776 if (STMT_VINFO_DATA_REF (stmt_info))
3778 if (dump_enabled_p ())
3780 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3781 "not vectorized: more than one data ref "
3782 "in stmt: ");
3783 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3784 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3787 if (is_a <bb_vec_info> (vinfo))
3788 break;
3790 if (gatherscatter != SG_NONE || simd_lane_access)
3791 free_data_ref (dr);
3792 return false;
3795 STMT_VINFO_DATA_REF (stmt_info) = dr;
3796 if (simd_lane_access)
3798 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3799 free_data_ref (datarefs[i]);
3800 datarefs[i] = dr;
3803 /* Set vectype for STMT. */
3804 scalar_type = TREE_TYPE (DR_REF (dr));
3805 STMT_VINFO_VECTYPE (stmt_info)
3806 = get_vectype_for_scalar_type (scalar_type);
3807 if (!STMT_VINFO_VECTYPE (stmt_info))
3809 if (dump_enabled_p ())
3811 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3812 "not vectorized: no vectype for stmt: ");
3813 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3814 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3815 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3816 scalar_type);
3817 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3820 if (is_a <bb_vec_info> (vinfo))
3822 /* No vector type is fine, the ref can still participate
3823 in dependence analysis, we just can't vectorize it. */
3824 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3825 continue;
3828 if (gatherscatter != SG_NONE || simd_lane_access)
3830 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3831 if (gatherscatter != SG_NONE)
3832 free_data_ref (dr);
3834 return false;
3836 else
3838 if (dump_enabled_p ())
3840 dump_printf_loc (MSG_NOTE, vect_location,
3841 "got vectype for stmt: ");
3842 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3843 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3844 STMT_VINFO_VECTYPE (stmt_info));
3845 dump_printf (MSG_NOTE, "\n");
3849 /* Adjust the minimal vectorization factor according to the
3850 vector type. */
3851 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3852 if (vf > *min_vf)
3853 *min_vf = vf;
3855 if (gatherscatter != SG_NONE)
3857 tree off;
3858 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3859 NULL, &off, NULL)
3860 || get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3862 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3863 free_data_ref (dr);
3864 if (dump_enabled_p ())
3866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3867 (gatherscatter == GATHER) ?
3868 "not vectorized: not suitable for gather "
3869 "load " :
3870 "not vectorized: not suitable for scatter "
3871 "store ");
3872 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3873 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3875 return false;
3878 datarefs[i] = dr;
3879 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3882 else if (is_a <loop_vec_info> (vinfo)
3883 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3885 if (nested_in_vect_loop_p (loop, stmt))
3887 if (dump_enabled_p ())
3889 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3890 "not vectorized: not suitable for strided "
3891 "load ");
3892 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3893 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3895 return false;
3897 STMT_VINFO_STRIDED_P (stmt_info) = true;
3901 /* If we stopped analysis at the first dataref we could not analyze
3902 when trying to vectorize a basic-block mark the rest of the datarefs
3903 as not vectorizable and truncate the vector of datarefs. That
3904 avoids spending useless time in analyzing their dependence. */
3905 if (i != datarefs.length ())
3907 gcc_assert (is_a <bb_vec_info> (vinfo));
3908 for (unsigned j = i; j < datarefs.length (); ++j)
3910 data_reference_p dr = datarefs[j];
3911 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3912 free_data_ref (dr);
3914 datarefs.truncate (i);
3917 return true;
3921 /* Function vect_get_new_vect_var.
3923 Returns a name for a new variable. The current naming scheme appends the
3924 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3925 the name of vectorizer generated variables, and appends that to NAME if
3926 provided. */
3928 tree
3929 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3931 const char *prefix;
3932 tree new_vect_var;
3934 switch (var_kind)
3936 case vect_simple_var:
3937 prefix = "vect";
3938 break;
3939 case vect_scalar_var:
3940 prefix = "stmp";
3941 break;
3942 case vect_mask_var:
3943 prefix = "mask";
3944 break;
3945 case vect_pointer_var:
3946 prefix = "vectp";
3947 break;
3948 default:
3949 gcc_unreachable ();
3952 if (name)
3954 char* tmp = concat (prefix, "_", name, NULL);
3955 new_vect_var = create_tmp_reg (type, tmp);
3956 free (tmp);
3958 else
3959 new_vect_var = create_tmp_reg (type, prefix);
3961 return new_vect_var;
3964 /* Like vect_get_new_vect_var but return an SSA name. */
3966 tree
3967 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3969 const char *prefix;
3970 tree new_vect_var;
3972 switch (var_kind)
3974 case vect_simple_var:
3975 prefix = "vect";
3976 break;
3977 case vect_scalar_var:
3978 prefix = "stmp";
3979 break;
3980 case vect_pointer_var:
3981 prefix = "vectp";
3982 break;
3983 default:
3984 gcc_unreachable ();
3987 if (name)
3989 char* tmp = concat (prefix, "_", name, NULL);
3990 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
3991 free (tmp);
3993 else
3994 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
3996 return new_vect_var;
3999 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. */
4001 static void
4002 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
4003 stmt_vec_info stmt_info)
4005 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4006 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
4007 int misalign = DR_MISALIGNMENT (dr);
4008 if (misalign == -1)
4009 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4010 else
4011 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
4014 /* Function vect_create_addr_base_for_vector_ref.
4016 Create an expression that computes the address of the first memory location
4017 that will be accessed for a data reference.
4019 Input:
4020 STMT: The statement containing the data reference.
4021 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4022 OFFSET: Optional. If supplied, it is be added to the initial address.
4023 LOOP: Specify relative to which loop-nest should the address be computed.
4024 For example, when the dataref is in an inner-loop nested in an
4025 outer-loop that is now being vectorized, LOOP can be either the
4026 outer-loop, or the inner-loop. The first memory location accessed
4027 by the following dataref ('in' points to short):
4029 for (i=0; i<N; i++)
4030 for (j=0; j<M; j++)
4031 s += in[i+j]
4033 is as follows:
4034 if LOOP=i_loop: &in (relative to i_loop)
4035 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4036 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4037 initial address. Unlike OFFSET, which is number of elements to
4038 be added, BYTE_OFFSET is measured in bytes.
4040 Output:
4041 1. Return an SSA_NAME whose value is the address of the memory location of
4042 the first vector of the data reference.
4043 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4044 these statement(s) which define the returned SSA_NAME.
4046 FORNOW: We are only handling array accesses with step 1. */
4048 tree
4049 vect_create_addr_base_for_vector_ref (gimple *stmt,
4050 gimple_seq *new_stmt_list,
4051 tree offset,
4052 struct loop *loop,
4053 tree byte_offset)
4055 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4056 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4057 tree data_ref_base;
4058 const char *base_name;
4059 tree addr_base;
4060 tree dest;
4061 gimple_seq seq = NULL;
4062 tree base_offset;
4063 tree init;
4064 tree vect_ptr_type;
4065 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4066 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4068 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
4070 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
4072 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
4074 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4075 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
4076 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
4078 else
4080 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
4081 base_offset = unshare_expr (DR_OFFSET (dr));
4082 init = unshare_expr (DR_INIT (dr));
4085 if (loop_vinfo)
4086 base_name = get_name (data_ref_base);
4087 else
4089 base_offset = ssize_int (0);
4090 init = ssize_int (0);
4091 base_name = get_name (DR_REF (dr));
4094 /* Create base_offset */
4095 base_offset = size_binop (PLUS_EXPR,
4096 fold_convert (sizetype, base_offset),
4097 fold_convert (sizetype, init));
4099 if (offset)
4101 offset = fold_build2 (MULT_EXPR, sizetype,
4102 fold_convert (sizetype, offset), step);
4103 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4104 base_offset, offset);
4106 if (byte_offset)
4108 byte_offset = fold_convert (sizetype, byte_offset);
4109 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4110 base_offset, byte_offset);
4113 /* base + base_offset */
4114 if (loop_vinfo)
4115 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4116 else
4118 addr_base = build1 (ADDR_EXPR,
4119 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4120 unshare_expr (DR_REF (dr)));
4123 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4124 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4125 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4126 gimple_seq_add_seq (new_stmt_list, seq);
4128 if (DR_PTR_INFO (dr)
4129 && TREE_CODE (addr_base) == SSA_NAME
4130 && !SSA_NAME_PTR_INFO (addr_base))
4132 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4133 if (offset || byte_offset)
4134 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4137 if (dump_enabled_p ())
4139 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4140 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4141 dump_printf (MSG_NOTE, "\n");
4144 return addr_base;
4148 /* Function vect_create_data_ref_ptr.
4150 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4151 location accessed in the loop by STMT, along with the def-use update
4152 chain to appropriately advance the pointer through the loop iterations.
4153 Also set aliasing information for the pointer. This pointer is used by
4154 the callers to this function to create a memory reference expression for
4155 vector load/store access.
4157 Input:
4158 1. STMT: a stmt that references memory. Expected to be of the form
4159 GIMPLE_ASSIGN <name, data-ref> or
4160 GIMPLE_ASSIGN <data-ref, name>.
4161 2. AGGR_TYPE: the type of the reference, which should be either a vector
4162 or an array.
4163 3. AT_LOOP: the loop where the vector memref is to be created.
4164 4. OFFSET (optional): an offset to be added to the initial address accessed
4165 by the data-ref in STMT.
4166 5. BSI: location where the new stmts are to be placed if there is no loop
4167 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4168 pointing to the initial address.
4169 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4170 to the initial address accessed by the data-ref in STMT. This is
4171 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4172 in bytes.
4174 Output:
4175 1. Declare a new ptr to vector_type, and have it point to the base of the
4176 data reference (initial addressed accessed by the data reference).
4177 For example, for vector of type V8HI, the following code is generated:
4179 v8hi *ap;
4180 ap = (v8hi *)initial_address;
4182 if OFFSET is not supplied:
4183 initial_address = &a[init];
4184 if OFFSET is supplied:
4185 initial_address = &a[init + OFFSET];
4186 if BYTE_OFFSET is supplied:
4187 initial_address = &a[init] + BYTE_OFFSET;
4189 Return the initial_address in INITIAL_ADDRESS.
4191 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4192 update the pointer in each iteration of the loop.
4194 Return the increment stmt that updates the pointer in PTR_INCR.
4196 3. Set INV_P to true if the access pattern of the data reference in the
4197 vectorized loop is invariant. Set it to false otherwise.
4199 4. Return the pointer. */
4201 tree
4202 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4203 tree offset, tree *initial_address,
4204 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4205 bool only_init, bool *inv_p, tree byte_offset)
4207 const char *base_name;
4208 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4209 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4210 struct loop *loop = NULL;
4211 bool nested_in_vect_loop = false;
4212 struct loop *containing_loop = NULL;
4213 tree aggr_ptr_type;
4214 tree aggr_ptr;
4215 tree new_temp;
4216 gimple_seq new_stmt_list = NULL;
4217 edge pe = NULL;
4218 basic_block new_bb;
4219 tree aggr_ptr_init;
4220 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4221 tree aptr;
4222 gimple_stmt_iterator incr_gsi;
4223 bool insert_after;
4224 tree indx_before_incr, indx_after_incr;
4225 gimple *incr;
4226 tree step;
4227 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4229 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4230 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4232 if (loop_vinfo)
4234 loop = LOOP_VINFO_LOOP (loop_vinfo);
4235 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4236 containing_loop = (gimple_bb (stmt))->loop_father;
4237 pe = loop_preheader_edge (loop);
4239 else
4241 gcc_assert (bb_vinfo);
4242 only_init = true;
4243 *ptr_incr = NULL;
4246 /* Check the step (evolution) of the load in LOOP, and record
4247 whether it's invariant. */
4248 if (nested_in_vect_loop)
4249 step = STMT_VINFO_DR_STEP (stmt_info);
4250 else
4251 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4253 if (integer_zerop (step))
4254 *inv_p = true;
4255 else
4256 *inv_p = false;
4258 /* Create an expression for the first address accessed by this load
4259 in LOOP. */
4260 base_name = get_name (DR_BASE_ADDRESS (dr));
4262 if (dump_enabled_p ())
4264 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4265 dump_printf_loc (MSG_NOTE, vect_location,
4266 "create %s-pointer variable to type: ",
4267 get_tree_code_name (TREE_CODE (aggr_type)));
4268 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4269 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4270 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4271 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4272 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4273 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4274 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4275 else
4276 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4277 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4278 dump_printf (MSG_NOTE, "\n");
4281 /* (1) Create the new aggregate-pointer variable.
4282 Vector and array types inherit the alias set of their component
4283 type by default so we need to use a ref-all pointer if the data
4284 reference does not conflict with the created aggregated data
4285 reference because it is not addressable. */
4286 bool need_ref_all = false;
4287 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4288 get_alias_set (DR_REF (dr))))
4289 need_ref_all = true;
4290 /* Likewise for any of the data references in the stmt group. */
4291 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4293 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4296 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4297 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4298 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4299 get_alias_set (DR_REF (sdr))))
4301 need_ref_all = true;
4302 break;
4304 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4306 while (orig_stmt);
4308 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4309 need_ref_all);
4310 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4313 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4314 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4315 def-use update cycles for the pointer: one relative to the outer-loop
4316 (LOOP), which is what steps (3) and (4) below do. The other is relative
4317 to the inner-loop (which is the inner-most loop containing the dataref),
4318 and this is done be step (5) below.
4320 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4321 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4322 redundant. Steps (3),(4) create the following:
4324 vp0 = &base_addr;
4325 LOOP: vp1 = phi(vp0,vp2)
4328 vp2 = vp1 + step
4329 goto LOOP
4331 If there is an inner-loop nested in loop, then step (5) will also be
4332 applied, and an additional update in the inner-loop will be created:
4334 vp0 = &base_addr;
4335 LOOP: vp1 = phi(vp0,vp2)
4337 inner: vp3 = phi(vp1,vp4)
4338 vp4 = vp3 + inner_step
4339 if () goto inner
4341 vp2 = vp1 + step
4342 if () goto LOOP */
4344 /* (2) Calculate the initial address of the aggregate-pointer, and set
4345 the aggregate-pointer to point to it before the loop. */
4347 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4349 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4350 offset, loop, byte_offset);
4351 if (new_stmt_list)
4353 if (pe)
4355 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4356 gcc_assert (!new_bb);
4358 else
4359 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4362 *initial_address = new_temp;
4363 aggr_ptr_init = new_temp;
4365 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4366 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4367 inner-loop nested in LOOP (during outer-loop vectorization). */
4369 /* No update in loop is required. */
4370 if (only_init && (!loop_vinfo || at_loop == loop))
4371 aptr = aggr_ptr_init;
4372 else
4374 /* The step of the aggregate pointer is the type size. */
4375 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4376 /* One exception to the above is when the scalar step of the load in
4377 LOOP is zero. In this case the step here is also zero. */
4378 if (*inv_p)
4379 iv_step = size_zero_node;
4380 else if (tree_int_cst_sgn (step) == -1)
4381 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4383 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4385 create_iv (aggr_ptr_init,
4386 fold_convert (aggr_ptr_type, iv_step),
4387 aggr_ptr, loop, &incr_gsi, insert_after,
4388 &indx_before_incr, &indx_after_incr);
4389 incr = gsi_stmt (incr_gsi);
4390 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4392 /* Copy the points-to information if it exists. */
4393 if (DR_PTR_INFO (dr))
4395 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4396 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4398 if (ptr_incr)
4399 *ptr_incr = incr;
4401 aptr = indx_before_incr;
4404 if (!nested_in_vect_loop || only_init)
4405 return aptr;
4408 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4409 nested in LOOP, if exists. */
4411 gcc_assert (nested_in_vect_loop);
4412 if (!only_init)
4414 standard_iv_increment_position (containing_loop, &incr_gsi,
4415 &insert_after);
4416 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4417 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4418 &indx_after_incr);
4419 incr = gsi_stmt (incr_gsi);
4420 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4422 /* Copy the points-to information if it exists. */
4423 if (DR_PTR_INFO (dr))
4425 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4426 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4428 if (ptr_incr)
4429 *ptr_incr = incr;
4431 return indx_before_incr;
4433 else
4434 gcc_unreachable ();
4438 /* Function bump_vector_ptr
4440 Increment a pointer (to a vector type) by vector-size. If requested,
4441 i.e. if PTR-INCR is given, then also connect the new increment stmt
4442 to the existing def-use update-chain of the pointer, by modifying
4443 the PTR_INCR as illustrated below:
4445 The pointer def-use update-chain before this function:
4446 DATAREF_PTR = phi (p_0, p_2)
4447 ....
4448 PTR_INCR: p_2 = DATAREF_PTR + step
4450 The pointer def-use update-chain after this function:
4451 DATAREF_PTR = phi (p_0, p_2)
4452 ....
4453 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4454 ....
4455 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4457 Input:
4458 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4459 in the loop.
4460 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4461 the loop. The increment amount across iterations is expected
4462 to be vector_size.
4463 BSI - location where the new update stmt is to be placed.
4464 STMT - the original scalar memory-access stmt that is being vectorized.
4465 BUMP - optional. The offset by which to bump the pointer. If not given,
4466 the offset is assumed to be vector_size.
4468 Output: Return NEW_DATAREF_PTR as illustrated above.
4472 tree
4473 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4474 gimple *stmt, tree bump)
4476 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4477 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4478 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4479 tree update = TYPE_SIZE_UNIT (vectype);
4480 gassign *incr_stmt;
4481 ssa_op_iter iter;
4482 use_operand_p use_p;
4483 tree new_dataref_ptr;
4485 if (bump)
4486 update = bump;
4488 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4489 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4490 else
4491 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4492 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4493 dataref_ptr, update);
4494 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4496 /* Copy the points-to information if it exists. */
4497 if (DR_PTR_INFO (dr))
4499 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4500 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4503 if (!ptr_incr)
4504 return new_dataref_ptr;
4506 /* Update the vector-pointer's cross-iteration increment. */
4507 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4509 tree use = USE_FROM_PTR (use_p);
4511 if (use == dataref_ptr)
4512 SET_USE (use_p, new_dataref_ptr);
4513 else
4514 gcc_assert (tree_int_cst_compare (use, update) == 0);
4517 return new_dataref_ptr;
4521 /* Function vect_create_destination_var.
4523 Create a new temporary of type VECTYPE. */
4525 tree
4526 vect_create_destination_var (tree scalar_dest, tree vectype)
4528 tree vec_dest;
4529 const char *name;
4530 char *new_name;
4531 tree type;
4532 enum vect_var_kind kind;
4534 kind = vectype
4535 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4536 ? vect_mask_var
4537 : vect_simple_var
4538 : vect_scalar_var;
4539 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4541 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4543 name = get_name (scalar_dest);
4544 if (name)
4545 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4546 else
4547 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4548 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4549 free (new_name);
4551 return vec_dest;
4554 /* Function vect_grouped_store_supported.
4556 Returns TRUE if interleave high and interleave low permutations
4557 are supported, and FALSE otherwise. */
4559 bool
4560 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4562 machine_mode mode = TYPE_MODE (vectype);
4564 /* vect_permute_store_chain requires the group size to be equal to 3 or
4565 be a power of two. */
4566 if (count != 3 && exact_log2 (count) == -1)
4568 if (dump_enabled_p ())
4569 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4570 "the size of the group of accesses"
4571 " is not a power of 2 or not eqaul to 3\n");
4572 return false;
4575 /* Check that the permutation is supported. */
4576 if (VECTOR_MODE_P (mode))
4578 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4579 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4581 if (count == 3)
4583 unsigned int j0 = 0, j1 = 0, j2 = 0;
4584 unsigned int i, j;
4586 for (j = 0; j < 3; j++)
4588 int nelt0 = ((3 - j) * nelt) % 3;
4589 int nelt1 = ((3 - j) * nelt + 1) % 3;
4590 int nelt2 = ((3 - j) * nelt + 2) % 3;
4591 for (i = 0; i < nelt; i++)
4593 if (3 * i + nelt0 < nelt)
4594 sel[3 * i + nelt0] = j0++;
4595 if (3 * i + nelt1 < nelt)
4596 sel[3 * i + nelt1] = nelt + j1++;
4597 if (3 * i + nelt2 < nelt)
4598 sel[3 * i + nelt2] = 0;
4600 if (!can_vec_perm_p (mode, false, sel))
4602 if (dump_enabled_p ())
4603 dump_printf (MSG_MISSED_OPTIMIZATION,
4604 "permutaion op not supported by target.\n");
4605 return false;
4608 for (i = 0; i < nelt; i++)
4610 if (3 * i + nelt0 < nelt)
4611 sel[3 * i + nelt0] = 3 * i + nelt0;
4612 if (3 * i + nelt1 < nelt)
4613 sel[3 * i + nelt1] = 3 * i + nelt1;
4614 if (3 * i + nelt2 < nelt)
4615 sel[3 * i + nelt2] = nelt + j2++;
4617 if (!can_vec_perm_p (mode, false, sel))
4619 if (dump_enabled_p ())
4620 dump_printf (MSG_MISSED_OPTIMIZATION,
4621 "permutaion op not supported by target.\n");
4622 return false;
4625 return true;
4627 else
4629 /* If length is not equal to 3 then only power of 2 is supported. */
4630 gcc_assert (exact_log2 (count) != -1);
4632 for (i = 0; i < nelt / 2; i++)
4634 sel[i * 2] = i;
4635 sel[i * 2 + 1] = i + nelt;
4637 if (can_vec_perm_p (mode, false, sel))
4639 for (i = 0; i < nelt; i++)
4640 sel[i] += nelt / 2;
4641 if (can_vec_perm_p (mode, false, sel))
4642 return true;
4647 if (dump_enabled_p ())
4648 dump_printf (MSG_MISSED_OPTIMIZATION,
4649 "permutaion op not supported by target.\n");
4650 return false;
4654 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4655 type VECTYPE. */
4657 bool
4658 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4660 return vect_lanes_optab_supported_p ("vec_store_lanes",
4661 vec_store_lanes_optab,
4662 vectype, count);
4666 /* Function vect_permute_store_chain.
4668 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4669 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4670 the data correctly for the stores. Return the final references for stores
4671 in RESULT_CHAIN.
4673 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4674 The input is 4 vectors each containing 8 elements. We assign a number to
4675 each element, the input sequence is:
4677 1st vec: 0 1 2 3 4 5 6 7
4678 2nd vec: 8 9 10 11 12 13 14 15
4679 3rd vec: 16 17 18 19 20 21 22 23
4680 4th vec: 24 25 26 27 28 29 30 31
4682 The output sequence should be:
4684 1st vec: 0 8 16 24 1 9 17 25
4685 2nd vec: 2 10 18 26 3 11 19 27
4686 3rd vec: 4 12 20 28 5 13 21 30
4687 4th vec: 6 14 22 30 7 15 23 31
4689 i.e., we interleave the contents of the four vectors in their order.
4691 We use interleave_high/low instructions to create such output. The input of
4692 each interleave_high/low operation is two vectors:
4693 1st vec 2nd vec
4694 0 1 2 3 4 5 6 7
4695 the even elements of the result vector are obtained left-to-right from the
4696 high/low elements of the first vector. The odd elements of the result are
4697 obtained left-to-right from the high/low elements of the second vector.
4698 The output of interleave_high will be: 0 4 1 5
4699 and of interleave_low: 2 6 3 7
4702 The permutation is done in log LENGTH stages. In each stage interleave_high
4703 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4704 where the first argument is taken from the first half of DR_CHAIN and the
4705 second argument from it's second half.
4706 In our example,
4708 I1: interleave_high (1st vec, 3rd vec)
4709 I2: interleave_low (1st vec, 3rd vec)
4710 I3: interleave_high (2nd vec, 4th vec)
4711 I4: interleave_low (2nd vec, 4th vec)
4713 The output for the first stage is:
4715 I1: 0 16 1 17 2 18 3 19
4716 I2: 4 20 5 21 6 22 7 23
4717 I3: 8 24 9 25 10 26 11 27
4718 I4: 12 28 13 29 14 30 15 31
4720 The output of the second stage, i.e. the final result is:
4722 I1: 0 8 16 24 1 9 17 25
4723 I2: 2 10 18 26 3 11 19 27
4724 I3: 4 12 20 28 5 13 21 30
4725 I4: 6 14 22 30 7 15 23 31. */
4727 void
4728 vect_permute_store_chain (vec<tree> dr_chain,
4729 unsigned int length,
4730 gimple *stmt,
4731 gimple_stmt_iterator *gsi,
4732 vec<tree> *result_chain)
4734 tree vect1, vect2, high, low;
4735 gimple *perm_stmt;
4736 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4737 tree perm_mask_low, perm_mask_high;
4738 tree data_ref;
4739 tree perm3_mask_low, perm3_mask_high;
4740 unsigned int i, n, log_length = exact_log2 (length);
4741 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4742 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4744 result_chain->quick_grow (length);
4745 memcpy (result_chain->address (), dr_chain.address (),
4746 length * sizeof (tree));
4748 if (length == 3)
4750 unsigned int j0 = 0, j1 = 0, j2 = 0;
4752 for (j = 0; j < 3; j++)
4754 int nelt0 = ((3 - j) * nelt) % 3;
4755 int nelt1 = ((3 - j) * nelt + 1) % 3;
4756 int nelt2 = ((3 - j) * nelt + 2) % 3;
4758 for (i = 0; i < nelt; i++)
4760 if (3 * i + nelt0 < nelt)
4761 sel[3 * i + nelt0] = j0++;
4762 if (3 * i + nelt1 < nelt)
4763 sel[3 * i + nelt1] = nelt + j1++;
4764 if (3 * i + nelt2 < nelt)
4765 sel[3 * i + nelt2] = 0;
4767 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4769 for (i = 0; i < nelt; i++)
4771 if (3 * i + nelt0 < nelt)
4772 sel[3 * i + nelt0] = 3 * i + nelt0;
4773 if (3 * i + nelt1 < nelt)
4774 sel[3 * i + nelt1] = 3 * i + nelt1;
4775 if (3 * i + nelt2 < nelt)
4776 sel[3 * i + nelt2] = nelt + j2++;
4778 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4780 vect1 = dr_chain[0];
4781 vect2 = dr_chain[1];
4783 /* Create interleaving stmt:
4784 low = VEC_PERM_EXPR <vect1, vect2,
4785 {j, nelt, *, j + 1, nelt + j + 1, *,
4786 j + 2, nelt + j + 2, *, ...}> */
4787 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4788 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4789 vect2, perm3_mask_low);
4790 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4792 vect1 = data_ref;
4793 vect2 = dr_chain[2];
4794 /* Create interleaving stmt:
4795 low = VEC_PERM_EXPR <vect1, vect2,
4796 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4797 6, 7, nelt + j + 2, ...}> */
4798 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4799 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4800 vect2, perm3_mask_high);
4801 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4802 (*result_chain)[j] = data_ref;
4805 else
4807 /* If length is not equal to 3 then only power of 2 is supported. */
4808 gcc_assert (exact_log2 (length) != -1);
4810 for (i = 0, n = nelt / 2; i < n; i++)
4812 sel[i * 2] = i;
4813 sel[i * 2 + 1] = i + nelt;
4815 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4817 for (i = 0; i < nelt; i++)
4818 sel[i] += nelt / 2;
4819 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4821 for (i = 0, n = log_length; i < n; i++)
4823 for (j = 0; j < length/2; j++)
4825 vect1 = dr_chain[j];
4826 vect2 = dr_chain[j+length/2];
4828 /* Create interleaving stmt:
4829 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4830 ...}> */
4831 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4832 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4833 vect2, perm_mask_high);
4834 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4835 (*result_chain)[2*j] = high;
4837 /* Create interleaving stmt:
4838 low = VEC_PERM_EXPR <vect1, vect2,
4839 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4840 ...}> */
4841 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4842 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4843 vect2, perm_mask_low);
4844 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4845 (*result_chain)[2*j+1] = low;
4847 memcpy (dr_chain.address (), result_chain->address (),
4848 length * sizeof (tree));
4853 /* Function vect_setup_realignment
4855 This function is called when vectorizing an unaligned load using
4856 the dr_explicit_realign[_optimized] scheme.
4857 This function generates the following code at the loop prolog:
4859 p = initial_addr;
4860 x msq_init = *(floor(p)); # prolog load
4861 realignment_token = call target_builtin;
4862 loop:
4863 x msq = phi (msq_init, ---)
4865 The stmts marked with x are generated only for the case of
4866 dr_explicit_realign_optimized.
4868 The code above sets up a new (vector) pointer, pointing to the first
4869 location accessed by STMT, and a "floor-aligned" load using that pointer.
4870 It also generates code to compute the "realignment-token" (if the relevant
4871 target hook was defined), and creates a phi-node at the loop-header bb
4872 whose arguments are the result of the prolog-load (created by this
4873 function) and the result of a load that takes place in the loop (to be
4874 created by the caller to this function).
4876 For the case of dr_explicit_realign_optimized:
4877 The caller to this function uses the phi-result (msq) to create the
4878 realignment code inside the loop, and sets up the missing phi argument,
4879 as follows:
4880 loop:
4881 msq = phi (msq_init, lsq)
4882 lsq = *(floor(p')); # load in loop
4883 result = realign_load (msq, lsq, realignment_token);
4885 For the case of dr_explicit_realign:
4886 loop:
4887 msq = *(floor(p)); # load in loop
4888 p' = p + (VS-1);
4889 lsq = *(floor(p')); # load in loop
4890 result = realign_load (msq, lsq, realignment_token);
4892 Input:
4893 STMT - (scalar) load stmt to be vectorized. This load accesses
4894 a memory location that may be unaligned.
4895 BSI - place where new code is to be inserted.
4896 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4897 is used.
4899 Output:
4900 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4901 target hook, if defined.
4902 Return value - the result of the loop-header phi node. */
4904 tree
4905 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4906 tree *realignment_token,
4907 enum dr_alignment_support alignment_support_scheme,
4908 tree init_addr,
4909 struct loop **at_loop)
4911 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4912 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4913 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4914 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4915 struct loop *loop = NULL;
4916 edge pe = NULL;
4917 tree scalar_dest = gimple_assign_lhs (stmt);
4918 tree vec_dest;
4919 gimple *inc;
4920 tree ptr;
4921 tree data_ref;
4922 basic_block new_bb;
4923 tree msq_init = NULL_TREE;
4924 tree new_temp;
4925 gphi *phi_stmt;
4926 tree msq = NULL_TREE;
4927 gimple_seq stmts = NULL;
4928 bool inv_p;
4929 bool compute_in_loop = false;
4930 bool nested_in_vect_loop = false;
4931 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4932 struct loop *loop_for_initial_load = NULL;
4934 if (loop_vinfo)
4936 loop = LOOP_VINFO_LOOP (loop_vinfo);
4937 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4940 gcc_assert (alignment_support_scheme == dr_explicit_realign
4941 || alignment_support_scheme == dr_explicit_realign_optimized);
4943 /* We need to generate three things:
4944 1. the misalignment computation
4945 2. the extra vector load (for the optimized realignment scheme).
4946 3. the phi node for the two vectors from which the realignment is
4947 done (for the optimized realignment scheme). */
4949 /* 1. Determine where to generate the misalignment computation.
4951 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4952 calculation will be generated by this function, outside the loop (in the
4953 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4954 caller, inside the loop.
4956 Background: If the misalignment remains fixed throughout the iterations of
4957 the loop, then both realignment schemes are applicable, and also the
4958 misalignment computation can be done outside LOOP. This is because we are
4959 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4960 are a multiple of VS (the Vector Size), and therefore the misalignment in
4961 different vectorized LOOP iterations is always the same.
4962 The problem arises only if the memory access is in an inner-loop nested
4963 inside LOOP, which is now being vectorized using outer-loop vectorization.
4964 This is the only case when the misalignment of the memory access may not
4965 remain fixed throughout the iterations of the inner-loop (as explained in
4966 detail in vect_supportable_dr_alignment). In this case, not only is the
4967 optimized realignment scheme not applicable, but also the misalignment
4968 computation (and generation of the realignment token that is passed to
4969 REALIGN_LOAD) have to be done inside the loop.
4971 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4972 or not, which in turn determines if the misalignment is computed inside
4973 the inner-loop, or outside LOOP. */
4975 if (init_addr != NULL_TREE || !loop_vinfo)
4977 compute_in_loop = true;
4978 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4982 /* 2. Determine where to generate the extra vector load.
4984 For the optimized realignment scheme, instead of generating two vector
4985 loads in each iteration, we generate a single extra vector load in the
4986 preheader of the loop, and in each iteration reuse the result of the
4987 vector load from the previous iteration. In case the memory access is in
4988 an inner-loop nested inside LOOP, which is now being vectorized using
4989 outer-loop vectorization, we need to determine whether this initial vector
4990 load should be generated at the preheader of the inner-loop, or can be
4991 generated at the preheader of LOOP. If the memory access has no evolution
4992 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4993 to be generated inside LOOP (in the preheader of the inner-loop). */
4995 if (nested_in_vect_loop)
4997 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4998 bool invariant_in_outerloop =
4999 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5000 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5002 else
5003 loop_for_initial_load = loop;
5004 if (at_loop)
5005 *at_loop = loop_for_initial_load;
5007 if (loop_for_initial_load)
5008 pe = loop_preheader_edge (loop_for_initial_load);
5010 /* 3. For the case of the optimized realignment, create the first vector
5011 load at the loop preheader. */
5013 if (alignment_support_scheme == dr_explicit_realign_optimized)
5015 /* Create msq_init = *(floor(p1)) in the loop preheader */
5016 gassign *new_stmt;
5018 gcc_assert (!compute_in_loop);
5019 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5020 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5021 NULL_TREE, &init_addr, NULL, &inc,
5022 true, &inv_p);
5023 if (TREE_CODE (ptr) == SSA_NAME)
5024 new_temp = copy_ssa_name (ptr);
5025 else
5026 new_temp = make_ssa_name (TREE_TYPE (ptr));
5027 new_stmt = gimple_build_assign
5028 (new_temp, BIT_AND_EXPR, ptr,
5029 build_int_cst (TREE_TYPE (ptr),
5030 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
5031 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5032 gcc_assert (!new_bb);
5033 data_ref
5034 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5035 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5036 new_stmt = gimple_build_assign (vec_dest, data_ref);
5037 new_temp = make_ssa_name (vec_dest, new_stmt);
5038 gimple_assign_set_lhs (new_stmt, new_temp);
5039 if (pe)
5041 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5042 gcc_assert (!new_bb);
5044 else
5045 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5047 msq_init = gimple_assign_lhs (new_stmt);
5050 /* 4. Create realignment token using a target builtin, if available.
5051 It is done either inside the containing loop, or before LOOP (as
5052 determined above). */
5054 if (targetm.vectorize.builtin_mask_for_load)
5056 gcall *new_stmt;
5057 tree builtin_decl;
5059 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5060 if (!init_addr)
5062 /* Generate the INIT_ADDR computation outside LOOP. */
5063 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5064 NULL_TREE, loop);
5065 if (loop)
5067 pe = loop_preheader_edge (loop);
5068 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5069 gcc_assert (!new_bb);
5071 else
5072 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5075 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5076 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5077 vec_dest =
5078 vect_create_destination_var (scalar_dest,
5079 gimple_call_return_type (new_stmt));
5080 new_temp = make_ssa_name (vec_dest, new_stmt);
5081 gimple_call_set_lhs (new_stmt, new_temp);
5083 if (compute_in_loop)
5084 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5085 else
5087 /* Generate the misalignment computation outside LOOP. */
5088 pe = loop_preheader_edge (loop);
5089 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5090 gcc_assert (!new_bb);
5093 *realignment_token = gimple_call_lhs (new_stmt);
5095 /* The result of the CALL_EXPR to this builtin is determined from
5096 the value of the parameter and no global variables are touched
5097 which makes the builtin a "const" function. Requiring the
5098 builtin to have the "const" attribute makes it unnecessary
5099 to call mark_call_clobbered. */
5100 gcc_assert (TREE_READONLY (builtin_decl));
5103 if (alignment_support_scheme == dr_explicit_realign)
5104 return msq;
5106 gcc_assert (!compute_in_loop);
5107 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5110 /* 5. Create msq = phi <msq_init, lsq> in loop */
5112 pe = loop_preheader_edge (containing_loop);
5113 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5114 msq = make_ssa_name (vec_dest);
5115 phi_stmt = create_phi_node (msq, containing_loop->header);
5116 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5118 return msq;
5122 /* Function vect_grouped_load_supported.
5124 Returns TRUE if even and odd permutations are supported,
5125 and FALSE otherwise. */
5127 bool
5128 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
5130 machine_mode mode = TYPE_MODE (vectype);
5132 /* vect_permute_load_chain requires the group size to be equal to 3 or
5133 be a power of two. */
5134 if (count != 3 && exact_log2 (count) == -1)
5136 if (dump_enabled_p ())
5137 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5138 "the size of the group of accesses"
5139 " is not a power of 2 or not equal to 3\n");
5140 return false;
5143 /* Check that the permutation is supported. */
5144 if (VECTOR_MODE_P (mode))
5146 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5147 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5149 if (count == 3)
5151 unsigned int k;
5152 for (k = 0; k < 3; k++)
5154 for (i = 0; i < nelt; i++)
5155 if (3 * i + k < 2 * nelt)
5156 sel[i] = 3 * i + k;
5157 else
5158 sel[i] = 0;
5159 if (!can_vec_perm_p (mode, false, sel))
5161 if (dump_enabled_p ())
5162 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5163 "shuffle of 3 loads is not supported by"
5164 " target\n");
5165 return false;
5167 for (i = 0, j = 0; i < nelt; i++)
5168 if (3 * i + k < 2 * nelt)
5169 sel[i] = i;
5170 else
5171 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5172 if (!can_vec_perm_p (mode, false, sel))
5174 if (dump_enabled_p ())
5175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5176 "shuffle of 3 loads is not supported by"
5177 " target\n");
5178 return false;
5181 return true;
5183 else
5185 /* If length is not equal to 3 then only power of 2 is supported. */
5186 gcc_assert (exact_log2 (count) != -1);
5187 for (i = 0; i < nelt; i++)
5188 sel[i] = i * 2;
5189 if (can_vec_perm_p (mode, false, sel))
5191 for (i = 0; i < nelt; i++)
5192 sel[i] = i * 2 + 1;
5193 if (can_vec_perm_p (mode, false, sel))
5194 return true;
5199 if (dump_enabled_p ())
5200 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5201 "extract even/odd not supported by target\n");
5202 return false;
5205 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5206 type VECTYPE. */
5208 bool
5209 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5211 return vect_lanes_optab_supported_p ("vec_load_lanes",
5212 vec_load_lanes_optab,
5213 vectype, count);
5216 /* Function vect_permute_load_chain.
5218 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5219 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5220 the input data correctly. Return the final references for loads in
5221 RESULT_CHAIN.
5223 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5224 The input is 4 vectors each containing 8 elements. We assign a number to each
5225 element, the input sequence is:
5227 1st vec: 0 1 2 3 4 5 6 7
5228 2nd vec: 8 9 10 11 12 13 14 15
5229 3rd vec: 16 17 18 19 20 21 22 23
5230 4th vec: 24 25 26 27 28 29 30 31
5232 The output sequence should be:
5234 1st vec: 0 4 8 12 16 20 24 28
5235 2nd vec: 1 5 9 13 17 21 25 29
5236 3rd vec: 2 6 10 14 18 22 26 30
5237 4th vec: 3 7 11 15 19 23 27 31
5239 i.e., the first output vector should contain the first elements of each
5240 interleaving group, etc.
5242 We use extract_even/odd instructions to create such output. The input of
5243 each extract_even/odd operation is two vectors
5244 1st vec 2nd vec
5245 0 1 2 3 4 5 6 7
5247 and the output is the vector of extracted even/odd elements. The output of
5248 extract_even will be: 0 2 4 6
5249 and of extract_odd: 1 3 5 7
5252 The permutation is done in log LENGTH stages. In each stage extract_even
5253 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5254 their order. In our example,
5256 E1: extract_even (1st vec, 2nd vec)
5257 E2: extract_odd (1st vec, 2nd vec)
5258 E3: extract_even (3rd vec, 4th vec)
5259 E4: extract_odd (3rd vec, 4th vec)
5261 The output for the first stage will be:
5263 E1: 0 2 4 6 8 10 12 14
5264 E2: 1 3 5 7 9 11 13 15
5265 E3: 16 18 20 22 24 26 28 30
5266 E4: 17 19 21 23 25 27 29 31
5268 In order to proceed and create the correct sequence for the next stage (or
5269 for the correct output, if the second stage is the last one, as in our
5270 example), we first put the output of extract_even operation and then the
5271 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5272 The input for the second stage is:
5274 1st vec (E1): 0 2 4 6 8 10 12 14
5275 2nd vec (E3): 16 18 20 22 24 26 28 30
5276 3rd vec (E2): 1 3 5 7 9 11 13 15
5277 4th vec (E4): 17 19 21 23 25 27 29 31
5279 The output of the second stage:
5281 E1: 0 4 8 12 16 20 24 28
5282 E2: 2 6 10 14 18 22 26 30
5283 E3: 1 5 9 13 17 21 25 29
5284 E4: 3 7 11 15 19 23 27 31
5286 And RESULT_CHAIN after reordering:
5288 1st vec (E1): 0 4 8 12 16 20 24 28
5289 2nd vec (E3): 1 5 9 13 17 21 25 29
5290 3rd vec (E2): 2 6 10 14 18 22 26 30
5291 4th vec (E4): 3 7 11 15 19 23 27 31. */
5293 static void
5294 vect_permute_load_chain (vec<tree> dr_chain,
5295 unsigned int length,
5296 gimple *stmt,
5297 gimple_stmt_iterator *gsi,
5298 vec<tree> *result_chain)
5300 tree data_ref, first_vect, second_vect;
5301 tree perm_mask_even, perm_mask_odd;
5302 tree perm3_mask_low, perm3_mask_high;
5303 gimple *perm_stmt;
5304 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5305 unsigned int i, j, log_length = exact_log2 (length);
5306 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5307 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5309 result_chain->quick_grow (length);
5310 memcpy (result_chain->address (), dr_chain.address (),
5311 length * sizeof (tree));
5313 if (length == 3)
5315 unsigned int k;
5317 for (k = 0; k < 3; k++)
5319 for (i = 0; i < nelt; i++)
5320 if (3 * i + k < 2 * nelt)
5321 sel[i] = 3 * i + k;
5322 else
5323 sel[i] = 0;
5324 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5326 for (i = 0, j = 0; i < nelt; i++)
5327 if (3 * i + k < 2 * nelt)
5328 sel[i] = i;
5329 else
5330 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5332 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5334 first_vect = dr_chain[0];
5335 second_vect = dr_chain[1];
5337 /* Create interleaving stmt (low part of):
5338 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5339 ...}> */
5340 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5341 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5342 second_vect, perm3_mask_low);
5343 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5345 /* Create interleaving stmt (high part of):
5346 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5347 ...}> */
5348 first_vect = data_ref;
5349 second_vect = dr_chain[2];
5350 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5351 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5352 second_vect, perm3_mask_high);
5353 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5354 (*result_chain)[k] = data_ref;
5357 else
5359 /* If length is not equal to 3 then only power of 2 is supported. */
5360 gcc_assert (exact_log2 (length) != -1);
5362 for (i = 0; i < nelt; ++i)
5363 sel[i] = i * 2;
5364 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5366 for (i = 0; i < nelt; ++i)
5367 sel[i] = i * 2 + 1;
5368 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5370 for (i = 0; i < log_length; i++)
5372 for (j = 0; j < length; j += 2)
5374 first_vect = dr_chain[j];
5375 second_vect = dr_chain[j+1];
5377 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5378 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5379 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5380 first_vect, second_vect,
5381 perm_mask_even);
5382 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5383 (*result_chain)[j/2] = data_ref;
5385 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5386 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5387 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5388 first_vect, second_vect,
5389 perm_mask_odd);
5390 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5391 (*result_chain)[j/2+length/2] = data_ref;
5393 memcpy (dr_chain.address (), result_chain->address (),
5394 length * sizeof (tree));
5399 /* Function vect_shift_permute_load_chain.
5401 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5402 sequence of stmts to reorder the input data accordingly.
5403 Return the final references for loads in RESULT_CHAIN.
5404 Return true if successed, false otherwise.
5406 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5407 The input is 3 vectors each containing 8 elements. We assign a
5408 number to each element, the input sequence is:
5410 1st vec: 0 1 2 3 4 5 6 7
5411 2nd vec: 8 9 10 11 12 13 14 15
5412 3rd vec: 16 17 18 19 20 21 22 23
5414 The output sequence should be:
5416 1st vec: 0 3 6 9 12 15 18 21
5417 2nd vec: 1 4 7 10 13 16 19 22
5418 3rd vec: 2 5 8 11 14 17 20 23
5420 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5422 First we shuffle all 3 vectors to get correct elements order:
5424 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5425 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5426 3rd vec: (16 19 22) (17 20 23) (18 21)
5428 Next we unite and shift vector 3 times:
5430 1st step:
5431 shift right by 6 the concatenation of:
5432 "1st vec" and "2nd vec"
5433 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5434 "2nd vec" and "3rd vec"
5435 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5436 "3rd vec" and "1st vec"
5437 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5438 | New vectors |
5440 So that now new vectors are:
5442 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5443 2nd vec: (10 13) (16 19 22) (17 20 23)
5444 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5446 2nd step:
5447 shift right by 5 the concatenation of:
5448 "1st vec" and "3rd vec"
5449 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5450 "2nd vec" and "1st vec"
5451 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5452 "3rd vec" and "2nd vec"
5453 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5454 | New vectors |
5456 So that now new vectors are:
5458 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5459 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5460 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5462 3rd step:
5463 shift right by 5 the concatenation of:
5464 "1st vec" and "1st vec"
5465 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5466 shift right by 3 the concatenation of:
5467 "2nd vec" and "2nd vec"
5468 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5469 | New vectors |
5471 So that now all vectors are READY:
5472 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5473 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5474 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5476 This algorithm is faster than one in vect_permute_load_chain if:
5477 1. "shift of a concatination" is faster than general permutation.
5478 This is usually so.
5479 2. The TARGET machine can't execute vector instructions in parallel.
5480 This is because each step of the algorithm depends on previous.
5481 The algorithm in vect_permute_load_chain is much more parallel.
5483 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5486 static bool
5487 vect_shift_permute_load_chain (vec<tree> dr_chain,
5488 unsigned int length,
5489 gimple *stmt,
5490 gimple_stmt_iterator *gsi,
5491 vec<tree> *result_chain)
5493 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5494 tree perm2_mask1, perm2_mask2, perm3_mask;
5495 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5496 gimple *perm_stmt;
5498 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5499 unsigned int i;
5500 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5501 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5502 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5503 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5505 result_chain->quick_grow (length);
5506 memcpy (result_chain->address (), dr_chain.address (),
5507 length * sizeof (tree));
5509 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5511 unsigned int j, log_length = exact_log2 (length);
5512 for (i = 0; i < nelt / 2; ++i)
5513 sel[i] = i * 2;
5514 for (i = 0; i < nelt / 2; ++i)
5515 sel[nelt / 2 + i] = i * 2 + 1;
5516 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5518 if (dump_enabled_p ())
5519 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5520 "shuffle of 2 fields structure is not \
5521 supported by target\n");
5522 return false;
5524 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5526 for (i = 0; i < nelt / 2; ++i)
5527 sel[i] = i * 2 + 1;
5528 for (i = 0; i < nelt / 2; ++i)
5529 sel[nelt / 2 + i] = i * 2;
5530 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5532 if (dump_enabled_p ())
5533 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5534 "shuffle of 2 fields structure is not \
5535 supported by target\n");
5536 return false;
5538 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5540 /* Generating permutation constant to shift all elements.
5541 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5542 for (i = 0; i < nelt; i++)
5543 sel[i] = nelt / 2 + i;
5544 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5546 if (dump_enabled_p ())
5547 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5548 "shift permutation is not supported by target\n");
5549 return false;
5551 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5553 /* Generating permutation constant to select vector from 2.
5554 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5555 for (i = 0; i < nelt / 2; i++)
5556 sel[i] = i;
5557 for (i = nelt / 2; i < nelt; i++)
5558 sel[i] = nelt + i;
5559 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5561 if (dump_enabled_p ())
5562 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5563 "select is not supported by target\n");
5564 return false;
5566 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5568 for (i = 0; i < log_length; i++)
5570 for (j = 0; j < length; j += 2)
5572 first_vect = dr_chain[j];
5573 second_vect = dr_chain[j + 1];
5575 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5576 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5577 first_vect, first_vect,
5578 perm2_mask1);
5579 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5580 vect[0] = data_ref;
5582 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5583 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5584 second_vect, second_vect,
5585 perm2_mask2);
5586 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5587 vect[1] = data_ref;
5589 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5590 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5591 vect[0], vect[1], shift1_mask);
5592 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5593 (*result_chain)[j/2 + length/2] = data_ref;
5595 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5596 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5597 vect[0], vect[1], select_mask);
5598 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5599 (*result_chain)[j/2] = data_ref;
5601 memcpy (dr_chain.address (), result_chain->address (),
5602 length * sizeof (tree));
5604 return true;
5606 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5608 unsigned int k = 0, l = 0;
5610 /* Generating permutation constant to get all elements in rigth order.
5611 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5612 for (i = 0; i < nelt; i++)
5614 if (3 * k + (l % 3) >= nelt)
5616 k = 0;
5617 l += (3 - (nelt % 3));
5619 sel[i] = 3 * k + (l % 3);
5620 k++;
5622 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5624 if (dump_enabled_p ())
5625 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5626 "shuffle of 3 fields structure is not \
5627 supported by target\n");
5628 return false;
5630 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5632 /* Generating permutation constant to shift all elements.
5633 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5634 for (i = 0; i < nelt; i++)
5635 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5636 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5638 if (dump_enabled_p ())
5639 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5640 "shift permutation is not supported by target\n");
5641 return false;
5643 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5645 /* Generating permutation constant to shift all elements.
5646 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5647 for (i = 0; i < nelt; i++)
5648 sel[i] = 2 * (nelt / 3) + 1 + i;
5649 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5651 if (dump_enabled_p ())
5652 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5653 "shift permutation is not supported by target\n");
5654 return false;
5656 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5658 /* Generating permutation constant to shift all elements.
5659 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5660 for (i = 0; i < nelt; i++)
5661 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5662 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5664 if (dump_enabled_p ())
5665 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5666 "shift permutation is not supported by target\n");
5667 return false;
5669 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5671 /* Generating permutation constant to shift all elements.
5672 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5673 for (i = 0; i < nelt; i++)
5674 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5675 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5677 if (dump_enabled_p ())
5678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5679 "shift permutation is not supported by target\n");
5680 return false;
5682 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5684 for (k = 0; k < 3; k++)
5686 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5687 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5688 dr_chain[k], dr_chain[k],
5689 perm3_mask);
5690 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5691 vect[k] = data_ref;
5694 for (k = 0; k < 3; k++)
5696 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5697 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5698 vect[k % 3], vect[(k + 1) % 3],
5699 shift1_mask);
5700 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5701 vect_shift[k] = data_ref;
5704 for (k = 0; k < 3; k++)
5706 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5707 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5708 vect_shift[(4 - k) % 3],
5709 vect_shift[(3 - k) % 3],
5710 shift2_mask);
5711 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5712 vect[k] = data_ref;
5715 (*result_chain)[3 - (nelt % 3)] = vect[2];
5717 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5718 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5719 vect[0], shift3_mask);
5720 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5721 (*result_chain)[nelt % 3] = data_ref;
5723 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5724 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5725 vect[1], shift4_mask);
5726 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5727 (*result_chain)[0] = data_ref;
5728 return true;
5730 return false;
5733 /* Function vect_transform_grouped_load.
5735 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5736 to perform their permutation and ascribe the result vectorized statements to
5737 the scalar statements.
5740 void
5741 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5742 gimple_stmt_iterator *gsi)
5744 machine_mode mode;
5745 vec<tree> result_chain = vNULL;
5747 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5748 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5749 vectors, that are ready for vector computation. */
5750 result_chain.create (size);
5752 /* If reassociation width for vector type is 2 or greater target machine can
5753 execute 2 or more vector instructions in parallel. Otherwise try to
5754 get chain for loads group using vect_shift_permute_load_chain. */
5755 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5756 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5757 || exact_log2 (size) != -1
5758 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5759 gsi, &result_chain))
5760 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5761 vect_record_grouped_load_vectors (stmt, result_chain);
5762 result_chain.release ();
5765 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5766 generated as part of the vectorization of STMT. Assign the statement
5767 for each vector to the associated scalar statement. */
5769 void
5770 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5772 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5773 gimple *next_stmt, *new_stmt;
5774 unsigned int i, gap_count;
5775 tree tmp_data_ref;
5777 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5778 Since we scan the chain starting from it's first node, their order
5779 corresponds the order of data-refs in RESULT_CHAIN. */
5780 next_stmt = first_stmt;
5781 gap_count = 1;
5782 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5784 if (!next_stmt)
5785 break;
5787 /* Skip the gaps. Loads created for the gaps will be removed by dead
5788 code elimination pass later. No need to check for the first stmt in
5789 the group, since it always exists.
5790 GROUP_GAP is the number of steps in elements from the previous
5791 access (if there is no gap GROUP_GAP is 1). We skip loads that
5792 correspond to the gaps. */
5793 if (next_stmt != first_stmt
5794 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5796 gap_count++;
5797 continue;
5800 while (next_stmt)
5802 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5803 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5804 copies, and we put the new vector statement in the first available
5805 RELATED_STMT. */
5806 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5807 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5808 else
5810 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5812 gimple *prev_stmt =
5813 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5814 gimple *rel_stmt =
5815 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5816 while (rel_stmt)
5818 prev_stmt = rel_stmt;
5819 rel_stmt =
5820 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5823 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5824 new_stmt;
5828 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5829 gap_count = 1;
5830 /* If NEXT_STMT accesses the same DR as the previous statement,
5831 put the same TMP_DATA_REF as its vectorized statement; otherwise
5832 get the next data-ref from RESULT_CHAIN. */
5833 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5834 break;
5839 /* Function vect_force_dr_alignment_p.
5841 Returns whether the alignment of a DECL can be forced to be aligned
5842 on ALIGNMENT bit boundary. */
5844 bool
5845 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5847 if (TREE_CODE (decl) != VAR_DECL)
5848 return false;
5850 if (decl_in_symtab_p (decl)
5851 && !symtab_node::get (decl)->can_increase_alignment_p ())
5852 return false;
5854 if (TREE_STATIC (decl))
5855 return (alignment <= MAX_OFILE_ALIGNMENT);
5856 else
5857 return (alignment <= MAX_STACK_ALIGNMENT);
5861 /* Return whether the data reference DR is supported with respect to its
5862 alignment.
5863 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5864 it is aligned, i.e., check if it is possible to vectorize it with different
5865 alignment. */
5867 enum dr_alignment_support
5868 vect_supportable_dr_alignment (struct data_reference *dr,
5869 bool check_aligned_accesses)
5871 gimple *stmt = DR_STMT (dr);
5872 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5873 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5874 machine_mode mode = TYPE_MODE (vectype);
5875 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5876 struct loop *vect_loop = NULL;
5877 bool nested_in_vect_loop = false;
5879 if (aligned_access_p (dr) && !check_aligned_accesses)
5880 return dr_aligned;
5882 /* For now assume all conditional loads/stores support unaligned
5883 access without any special code. */
5884 if (is_gimple_call (stmt)
5885 && gimple_call_internal_p (stmt)
5886 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5887 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5888 return dr_unaligned_supported;
5890 if (loop_vinfo)
5892 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5893 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5896 /* Possibly unaligned access. */
5898 /* We can choose between using the implicit realignment scheme (generating
5899 a misaligned_move stmt) and the explicit realignment scheme (generating
5900 aligned loads with a REALIGN_LOAD). There are two variants to the
5901 explicit realignment scheme: optimized, and unoptimized.
5902 We can optimize the realignment only if the step between consecutive
5903 vector loads is equal to the vector size. Since the vector memory
5904 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5905 is guaranteed that the misalignment amount remains the same throughout the
5906 execution of the vectorized loop. Therefore, we can create the
5907 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5908 at the loop preheader.
5910 However, in the case of outer-loop vectorization, when vectorizing a
5911 memory access in the inner-loop nested within the LOOP that is now being
5912 vectorized, while it is guaranteed that the misalignment of the
5913 vectorized memory access will remain the same in different outer-loop
5914 iterations, it is *not* guaranteed that is will remain the same throughout
5915 the execution of the inner-loop. This is because the inner-loop advances
5916 with the original scalar step (and not in steps of VS). If the inner-loop
5917 step happens to be a multiple of VS, then the misalignment remains fixed
5918 and we can use the optimized realignment scheme. For example:
5920 for (i=0; i<N; i++)
5921 for (j=0; j<M; j++)
5922 s += a[i+j];
5924 When vectorizing the i-loop in the above example, the step between
5925 consecutive vector loads is 1, and so the misalignment does not remain
5926 fixed across the execution of the inner-loop, and the realignment cannot
5927 be optimized (as illustrated in the following pseudo vectorized loop):
5929 for (i=0; i<N; i+=4)
5930 for (j=0; j<M; j++){
5931 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5932 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5933 // (assuming that we start from an aligned address).
5936 We therefore have to use the unoptimized realignment scheme:
5938 for (i=0; i<N; i+=4)
5939 for (j=k; j<M; j+=4)
5940 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5941 // that the misalignment of the initial address is
5942 // 0).
5944 The loop can then be vectorized as follows:
5946 for (k=0; k<4; k++){
5947 rt = get_realignment_token (&vp[k]);
5948 for (i=0; i<N; i+=4){
5949 v1 = vp[i+k];
5950 for (j=k; j<M; j+=4){
5951 v2 = vp[i+j+VS-1];
5952 va = REALIGN_LOAD <v1,v2,rt>;
5953 vs += va;
5954 v1 = v2;
5957 } */
5959 if (DR_IS_READ (dr))
5961 bool is_packed = false;
5962 tree type = (TREE_TYPE (DR_REF (dr)));
5964 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5965 && (!targetm.vectorize.builtin_mask_for_load
5966 || targetm.vectorize.builtin_mask_for_load ()))
5968 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5969 if ((nested_in_vect_loop
5970 && (TREE_INT_CST_LOW (DR_STEP (dr))
5971 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5972 || !loop_vinfo)
5973 return dr_explicit_realign;
5974 else
5975 return dr_explicit_realign_optimized;
5977 if (!known_alignment_for_access_p (dr))
5978 is_packed = not_size_aligned (DR_REF (dr));
5980 if ((TYPE_USER_ALIGN (type) && !is_packed)
5981 || targetm.vectorize.
5982 support_vector_misalignment (mode, type,
5983 DR_MISALIGNMENT (dr), is_packed))
5984 /* Can't software pipeline the loads, but can at least do them. */
5985 return dr_unaligned_supported;
5987 else
5989 bool is_packed = false;
5990 tree type = (TREE_TYPE (DR_REF (dr)));
5992 if (!known_alignment_for_access_p (dr))
5993 is_packed = not_size_aligned (DR_REF (dr));
5995 if ((TYPE_USER_ALIGN (type) && !is_packed)
5996 || targetm.vectorize.
5997 support_vector_misalignment (mode, type,
5998 DR_MISALIGNMENT (dr), is_packed))
5999 return dr_unaligned_supported;
6002 /* Unsupported. */
6003 return dr_unaligned_unsupported;