[official-gcc.git] / gcc / tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2016 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "tm_p.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "cgraph.h"
35 #include "dumpfile.h"
36 #include "alias.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "tree-eh.h"
40 #include "gimplify.h"
41 #include "gimple-iterator.h"
42 #include "gimplify-me.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "tree-ssa-loop-manip.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "expr.h"
50 #include "builtins.h"
51 #include "params.h"
53 /* Return true if load- or store-lanes optab OPTAB is implemented for
54 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
56 static bool
57 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
58 tree vectype, unsigned HOST_WIDE_INT count)
60 machine_mode mode, array_mode;
61 bool limit_p;
63 mode = TYPE_MODE (vectype);
64 limit_p = !targetm.array_mode_supported_p (mode, count);
65 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
66 MODE_INT, limit_p);
68 if (array_mode == BLKmode)
70 if (dump_enabled_p ())
71 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
72 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
73 GET_MODE_NAME (mode), count);
74 return false;
77 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
79 if (dump_enabled_p ())
80 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
81 "cannot use %s<%s><%s>\n", name,
82 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
83 return false;
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_NOTE, vect_location,
88 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
89 GET_MODE_NAME (mode));
91 return true;
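/* As a rough illustration of the check above: for a 128-bit vectype
   such as V4SI and COUNT == 3, the load/store-lanes form needs an
   integer "array" mode of 3 * 128 = 384 bits (either provided by
   targetm.array_mode_supported_p or found by mode_for_size) plus an
   optab handler converting between that array mode and V4SI; if either
   is missing, the interleaved access has to be handled with separate
   vector loads/stores and permutes instead.  */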
95 /* Return the smallest scalar part of STMT.
96 This is used to determine the vectype of the stmt. We generally set the
97 vectype according to the type of the result (lhs). For stmts whose
98 result-type is different than the type of the arguments (e.g., demotion,
99 promotion), vectype will be reset appropriately (later). Note that we have
100 to visit the smallest datatype in this function, because that determines the
101 VF. If the smallest datatype in the loop is present only as the rhs of a
102 promotion operation - we'd miss it.
103 Such a case, where a variable of this datatype does not appear in the lhs
104 anywhere in the loop, can only occur if it's an invariant: e.g.:
105 'int_x = (int) short_inv', which we'd expect to have been optimized away by
106 invariant motion. However, we cannot rely on invariant motion to always
107 take invariants out of the loop, and so in the case of promotion we also
108 have to check the rhs.
109 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
110 types. */
112 tree
113 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
114 HOST_WIDE_INT *rhs_size_unit)
116 tree scalar_type = gimple_expr_type (stmt);
117 HOST_WIDE_INT lhs, rhs;
119 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
121 if (is_gimple_assign (stmt)
122 && (gimple_assign_cast_p (stmt)
123 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
124 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
125 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
127 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
129 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
130 if (rhs < lhs)
131 scalar_type = rhs_type;
134 *lhs_size_unit = lhs;
135 *rhs_size_unit = rhs;
136 return scalar_type;
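/* A small example of the above: for the widening statement

     int_x = (int) short_y;

   the lhs size is 4 bytes and the rhs size is 2 bytes, so the function
   returns the short type (and sets *LHS_SIZE_UNIT = 4, *RHS_SIZE_UNIT = 2);
   the vectorization factor is then derived from the 2-byte element.  */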
140 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
141 tested at run-time. Return TRUE if DDR was successfully inserted.
142 Return false if versioning is not supported. */
144 static bool
145 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
147 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
149 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
150 return false;
152 if (dump_enabled_p ())
154 dump_printf_loc (MSG_NOTE, vect_location,
155 "mark for run-time aliasing test between ");
156 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
157 dump_printf (MSG_NOTE, " and ");
158 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
159 dump_printf (MSG_NOTE, "\n");
162 if (optimize_loop_nest_for_size_p (loop))
164 if (dump_enabled_p ())
165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
166 "versioning not supported when optimizing"
167 " for size.\n");
168 return false;
171 /* FORNOW: We don't support versioning with outer-loop vectorization. */
172 if (loop->inner)
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
176 "versioning not yet supported for outer-loops.\n");
177 return false;
180 /* FORNOW: We don't support creating runtime alias tests for non-constant
181 step. */
182 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
183 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
185 if (dump_enabled_p ())
186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
187 "versioning not yet supported for non-constant "
188 "step\n");
189 return false;
192 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
193 return true;
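/* Conceptually, marking a DDR here leads the vectorizer to emit a
   versioned loop roughly like

     if (a + n <= b || b + n <= a)   /- the accessed segments do not overlap
       ... vectorized loop ...
     else
       ... original scalar loop ...

   for a loop such as "for (i = 0; i < n; i++) a[i] = b[i] + 1;" where
   the compiler cannot prove that A and B do not alias.  This is only a
   sketch; the actual segment-overlap checks are built later from the
   recorded DR_STEPs and segment lengths.  */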
197 /* Function vect_analyze_data_ref_dependence.
199 Return TRUE if there (might) exist a dependence between a memory-reference
200 DRA and a memory-reference DRB. If the dependence can instead be checked
201 at run time by versioning for alias, return FALSE. Adjust *MAX_VF
202 according to the data dependence. */
204 static bool
205 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
206 loop_vec_info loop_vinfo, int *max_vf)
208 unsigned int i;
209 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
210 struct data_reference *dra = DDR_A (ddr);
211 struct data_reference *drb = DDR_B (ddr);
212 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
213 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
214 lambda_vector dist_v;
215 unsigned int loop_depth;
217 /* In loop analysis all data references should be vectorizable. */
218 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
219 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
220 gcc_unreachable ();
222 /* Independent data accesses. */
223 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
224 return false;
226 if (dra == drb
227 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
228 return false;
230 /* Even if we have an anti-dependence then, as the vectorized loop covers at
231 least two scalar iterations, there is always also a true dependence.
232 As the vectorizer does not re-order loads and stores, we can ignore
233 the anti-dependence if TBAA can disambiguate the two DRs, similar to the
234 case of known negative-distance anti-dependences (positive-distance
235 anti-dependences would violate the TBAA constraints). */
236 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
237 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
238 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
239 get_alias_set (DR_REF (drb))))
240 return false;
242 /* Unknown data dependence. */
243 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
245 /* If the user asserted that safelen consecutive iterations can be
246 executed concurrently, assume independence. */
247 if (loop->safelen >= 2)
249 if (loop->safelen < *max_vf)
250 *max_vf = loop->safelen;
251 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
252 return false;
255 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
256 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
258 if (dump_enabled_p ())
260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
261 "versioning for alias not supported for: "
262 "can't determine dependence between ");
263 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
264 DR_REF (dra));
265 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
266 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
267 DR_REF (drb));
268 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
270 return true;
273 if (dump_enabled_p ())
275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
276 "versioning for alias required: "
277 "can't determine dependence between ");
278 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
279 DR_REF (dra));
280 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
281 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
282 DR_REF (drb));
283 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
286 /* Add to list of ddrs that need to be tested at run-time. */
287 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
290 /* Known data dependence. */
291 if (DDR_NUM_DIST_VECTS (ddr) == 0)
293 /* If the user asserted that safelen consecutive iterations can be
294 executed concurrently, assume independence. */
295 if (loop->safelen >= 2)
297 if (loop->safelen < *max_vf)
298 *max_vf = loop->safelen;
299 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
300 return false;
303 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
304 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
306 if (dump_enabled_p ())
308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
309 "versioning for alias not supported for: "
310 "bad dist vector for ");
311 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
312 DR_REF (dra));
313 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
314 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
315 DR_REF (drb));
316 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
318 return true;
321 if (dump_enabled_p ())
323 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
324 "versioning for alias required: "
325 "bad dist vector for ");
326 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
327 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
328 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
329 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
331 /* Add to list of ddrs that need to be tested at run-time. */
332 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
335 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
336 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
338 int dist = dist_v[loop_depth];
340 if (dump_enabled_p ())
341 dump_printf_loc (MSG_NOTE, vect_location,
342 "dependence distance = %d.\n", dist);
344 if (dist == 0)
346 if (dump_enabled_p ())
348 dump_printf_loc (MSG_NOTE, vect_location,
349 "dependence distance == 0 between ");
350 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
351 dump_printf (MSG_NOTE, " and ");
352 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
353 dump_printf (MSG_NOTE, "\n");
356 /* When we perform grouped accesses and perform implicit CSE
357 by detecting equal accesses and doing disambiguation with
358 runtime alias tests like for
359 .. = a[i];
360 .. = a[i+1];
361 a[i] = ..;
362 a[i+1] = ..;
363 *p = ..;
364 .. = a[i];
365 .. = a[i+1];
366 where we will end up loading { a[i], a[i+1] } once, make
367 sure that inserting group loads before the first load and
368 stores after the last store will do the right thing.
369 Similar for groups like
370 a[i] = ...;
371 ... = a[i];
372 a[i+1] = ...;
373 where loads from the group interleave with the store. */
374 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
375 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
377 gimple *earlier_stmt;
378 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
379 if (DR_IS_WRITE
380 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
382 if (dump_enabled_p ())
383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
384 "READ_WRITE dependence in interleaving."
385 "\n");
386 return true;
390 continue;
393 if (dist > 0 && DDR_REVERSED_P (ddr))
395 /* If DDR_REVERSED_P the order of the data-refs in DDR was
396 reversed (to make distance vector positive), and the actual
397 distance is negative. */
398 if (dump_enabled_p ())
399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
400 "dependence distance negative.\n");
401 /* Record a negative dependence distance to later limit the
402 amount of stmt copying / unrolling we can perform.
403 Only need to handle read-after-write dependence. */
404 if (DR_IS_READ (drb)
405 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
406 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
407 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
408 continue;
411 if (abs (dist) >= 2
412 && abs (dist) < *max_vf)
414 /* The dependence distance requires reduction of the maximal
415 vectorization factor. */
416 *max_vf = abs (dist);
417 if (dump_enabled_p ())
418 dump_printf_loc (MSG_NOTE, vect_location,
419 "adjusting maximal vectorization factor to %i\n",
420 *max_vf);
423 if (abs (dist) >= *max_vf)
425 /* Dependence distance does not create dependence, as far as
426 vectorization is concerned, in this case. */
427 if (dump_enabled_p ())
428 dump_printf_loc (MSG_NOTE, vect_location,
429 "dependence distance >= VF.\n");
430 continue;
433 if (dump_enabled_p ())
435 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
436 "not vectorized, possible dependence "
437 "between data-refs ");
438 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
439 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
440 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
441 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
444 return true;
447 return false;
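/* An example of a distance-limited dependence handled above: in

     for (i = 0; i < n; i++)
       a[i + 4] = a[i] + 1;

   the value stored in iteration I is read again in iteration I + 4,
   giving a dependence distance of 4.  Such a loop is not rejected; the
   distance only constrains the vectorization factor (here VF <= 4 keeps
   every vector iteration within the dependence distance).  */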
450 /* Function vect_analyze_data_ref_dependences.
452 Examine all the data references in the loop, and make sure there do not
453 exist any data dependences between them. Set *MAX_VF according to
454 the maximum vectorization factor the data dependences allow. */
456 bool
457 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
459 unsigned int i;
460 struct data_dependence_relation *ddr;
462 if (dump_enabled_p ())
463 dump_printf_loc (MSG_NOTE, vect_location,
464 "=== vect_analyze_data_ref_dependences ===\n");
466 LOOP_VINFO_DDRS (loop_vinfo)
467 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
468 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
469 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
470 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
471 &LOOP_VINFO_DDRS (loop_vinfo),
472 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
473 return false;
475 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
476 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
477 return false;
479 return true;
483 /* Function vect_slp_analyze_data_ref_dependence.
485 Return TRUE if there (might) exist a dependence between a memory-reference
486 DRA and a memory-reference DRB that would prevent SLP vectorization.
487 Unlike the loop variant, no run-time alias versioning is performed
488 and there is no *MAX_VF to adjust. */
490 static bool
491 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
493 struct data_reference *dra = DDR_A (ddr);
494 struct data_reference *drb = DDR_B (ddr);
496 /* We need to check dependences of statements marked as unvectorizable
497 as well; they can still prohibit vectorization. */
499 /* Independent data accesses. */
500 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
501 return false;
503 if (dra == drb)
504 return false;
506 /* Read-read is OK. */
507 if (DR_IS_READ (dra) && DR_IS_READ (drb))
508 return false;
510 /* If dra and drb are part of the same interleaving chain consider
511 them independent. */
512 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
513 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
514 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
515 return false;
517 /* Unknown data dependence. */
518 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
520 if (dump_enabled_p ())
522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
523 "can't determine dependence between ");
524 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
525 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
526 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
527 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
530 else if (dump_enabled_p ())
532 dump_printf_loc (MSG_NOTE, vect_location,
533 "determined dependence between ");
534 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
535 dump_printf (MSG_NOTE, " and ");
536 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
537 dump_printf (MSG_NOTE, "\n");
540 return true;
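/* For example, the two stores in

     a[2*i]   = x;
     a[2*i+1] = y;

   form a single interleaving chain (they share GROUP_FIRST_ELEMENT), so
   the DDR between them is not treated as a dependence that would block
   the SLP transform.  */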
544 /* Analyze dependences involved in the transform of SLP NODE. STORES
545 contains the vector of scalar stores of this instance if we are
546 disambiguating the loads. */
548 static bool
549 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
550 vec<gimple *> stores, gimple *last_store)
552 /* This walks over all stmts involved in the SLP load/store done
553 in NODE verifying we can sink them up to the last stmt in the
554 group. */
555 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
556 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
558 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
559 if (access == last_access)
560 continue;
561 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
562 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
563 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
565 gimple *stmt = gsi_stmt (gsi);
566 if (! gimple_vuse (stmt)
567 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
568 continue;
570 /* If we couldn't record a (single) data reference for this
571 stmt we have to give up. */
572 /* ??? Here and below if dependence analysis fails we can resort
573 to the alias oracle which can handle more kinds of stmts. */
574 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
575 if (!dr_b)
576 return false;
578 /* If we run into a store of this same instance (we've just
579 marked those) then delay dependence checking until we run
580 into the last store because this is where it will have
581 been sunk to (and we verify if we can do that as well). */
582 if (gimple_visited_p (stmt))
584 if (stmt != last_store)
585 continue;
586 unsigned i;
587 gimple *store;
588 FOR_EACH_VEC_ELT (stores, i, store)
590 data_reference *store_dr
591 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
592 ddr_p ddr = initialize_data_dependence_relation
593 (dr_a, store_dr, vNULL);
594 if (vect_slp_analyze_data_ref_dependence (ddr))
596 free_dependence_relation (ddr);
597 return false;
599 free_dependence_relation (ddr);
603 ddr_p ddr = initialize_data_dependence_relation (dr_a, dr_b, vNULL);
604 if (vect_slp_analyze_data_ref_dependence (ddr))
606 free_dependence_relation (ddr);
607 return false;
609 free_dependence_relation (ddr);
612 return true;
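/* A sketch of the situation handled above: for a grouped SLP load

     x = a[i];
     *p = z;        /- intervening statement with a VDEF
     y = a[i+1];

   the vector load will be emitted at the position of the last group
   member, so we must verify that the access "x = a[i]" can be sunk past
   the store "*p = z"; if no (single) data reference was recorded for
   the intervening statement we conservatively give up.  */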
616 /* Function vect_slp_analyze_instance_dependence.
618 Examine all the data references in the SLP instance and make sure
619 there do not exist any data dependences that would prevent the
620 transform. */
622 bool
623 vect_slp_analyze_instance_dependence (slp_instance instance)
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE, vect_location,
627 "=== vect_slp_analyze_instance_dependence ===\n");
629 /* The stores of this instance are at the root of the SLP tree. */
630 slp_tree store = SLP_INSTANCE_TREE (instance);
631 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
632 store = NULL;
634 /* Verify we can sink stores to the vectorized stmt insert location. */
635 gimple *last_store = NULL;
636 if (store)
638 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
639 return false;
641 /* Mark stores in this instance and remember the last one. */
642 last_store = vect_find_last_scalar_stmt_in_slp (store);
643 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
644 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
647 bool res = true;
649 /* Verify we can sink loads to the vectorized stmt insert location,
650 special-casing stores of this instance. */
651 slp_tree load;
652 unsigned int i;
653 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
654 if (! vect_slp_analyze_node_dependences (instance, load,
655 store
656 ? SLP_TREE_SCALAR_STMTS (store)
657 : vNULL, last_store))
659 res = false;
660 break;
663 /* Unset the visited flag. */
664 if (store)
665 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
666 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
668 return res;
671 /* Function vect_compute_data_ref_alignment
673 Compute the misalignment of the data reference DR.
675 Output:
676 1. If during the misalignment computation it is found that the data reference
677 cannot be vectorized then false is returned.
678 2. DR_MISALIGNMENT (DR) is defined.
680 FOR NOW: No analysis is actually performed. Misalignment is calculated
681 only for trivial cases. TODO. */
683 bool
684 vect_compute_data_ref_alignment (struct data_reference *dr)
686 gimple *stmt = DR_STMT (dr);
687 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
688 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
689 struct loop *loop = NULL;
690 tree ref = DR_REF (dr);
691 tree vectype;
692 tree base, base_addr;
693 tree misalign = NULL_TREE;
694 tree aligned_to;
695 unsigned HOST_WIDE_INT alignment;
697 if (dump_enabled_p ())
698 dump_printf_loc (MSG_NOTE, vect_location,
699 "vect_compute_data_ref_alignment:\n");
701 if (loop_vinfo)
702 loop = LOOP_VINFO_LOOP (loop_vinfo);
704 /* Initialize misalignment to unknown. */
705 SET_DR_MISALIGNMENT (dr, -1);
707 if (tree_fits_shwi_p (DR_STEP (dr)))
708 misalign = DR_INIT (dr);
709 aligned_to = DR_ALIGNED_TO (dr);
710 base_addr = DR_BASE_ADDRESS (dr);
711 vectype = STMT_VINFO_VECTYPE (stmt_info);
713 /* In case the dataref is in an inner-loop of the loop that is being
714 vectorized (LOOP), we use the base and misalignment information
715 relative to the outer-loop (LOOP). This is ok only if the misalignment
716 stays the same throughout the execution of the inner-loop, which is why
717 we have to check that the stride of the dataref in the inner-loop is a
718 multiple of the vector size. */
719 if (loop && nested_in_vect_loop_p (loop, stmt))
721 tree step = DR_STEP (dr);
723 if (tree_fits_shwi_p (step)
724 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
726 if (dump_enabled_p ())
727 dump_printf_loc (MSG_NOTE, vect_location,
728 "inner step divides the vector-size.\n");
729 misalign = STMT_VINFO_DR_INIT (stmt_info);
730 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
731 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
733 else
735 if (dump_enabled_p ())
736 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
737 "inner step doesn't divide the vector-size.\n");
738 misalign = NULL_TREE;
742 /* Similarly we can only use base and misalignment information relative to
743 an innermost loop if the misalignment stays the same throughout the
744 execution of the loop. As above, this is the case if the stride of the
745 dataref times the vectorization factor is a multiple of the vector size. */
746 else
748 tree step = DR_STEP (dr);
749 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
751 if (tree_fits_shwi_p (step)
752 && ((tree_to_shwi (step) * vf)
753 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
755 if (dump_enabled_p ())
756 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
757 "step doesn't divide the vector-size.\n");
758 misalign = NULL_TREE;
762 /* To look at alignment of the base we have to preserve an inner MEM_REF
763 as that carries alignment information of the actual access. */
764 base = ref;
765 while (handled_component_p (base))
766 base = TREE_OPERAND (base, 0);
767 if (TREE_CODE (base) == MEM_REF)
768 base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
769 build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
770 unsigned int base_alignment = get_object_alignment (base);
772 if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
773 DR_VECT_AUX (dr)->base_element_aligned = true;
775 alignment = TYPE_ALIGN_UNIT (vectype);
777 if ((compare_tree_int (aligned_to, alignment) < 0)
778 || !misalign)
780 if (dump_enabled_p ())
782 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
783 "Unknown alignment for access: ");
784 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
785 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
787 return true;
790 if (base_alignment < TYPE_ALIGN (vectype))
792 /* Strip an inner MEM_REF to a bare decl if possible. */
793 if (TREE_CODE (base) == MEM_REF
794 && integer_zerop (TREE_OPERAND (base, 1))
795 && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
796 base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
798 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
800 if (dump_enabled_p ())
802 dump_printf_loc (MSG_NOTE, vect_location,
803 "can't force alignment of ref: ");
804 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
805 dump_printf (MSG_NOTE, "\n");
807 return true;
810 /* Force the alignment of the decl.
811 NOTE: This is the only change to the code we make during
812 the analysis phase, before deciding to vectorize the loop. */
813 if (dump_enabled_p ())
815 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
816 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
817 dump_printf (MSG_NOTE, "\n");
820 DR_VECT_AUX (dr)->base_decl = base;
821 DR_VECT_AUX (dr)->base_misaligned = true;
822 DR_VECT_AUX (dr)->base_element_aligned = true;
825 /* If this is a backward running DR then the first access in the larger
826 vectype is actually N-1 elements before the address in the DR.
827 Adjust misalign accordingly. */
828 if (tree_int_cst_sgn (DR_STEP (dr)) < 0)
830 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
831 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
832 otherwise we wouldn't be here. */
833 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
834 /* PLUS because DR_STEP was negative. */
835 misalign = size_binop (PLUS_EXPR, misalign, offset);
838 SET_DR_MISALIGNMENT (dr,
839 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
841 if (dump_enabled_p ())
843 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
844 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
845 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
846 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
849 return true;
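/* A worked example of the computation above, assuming a 16-byte V4SF
   vectype, a sufficiently aligned DR_BASE_ADDRESS and a constant step:
   with DR_INIT == 4 the recorded misalignment is 4 mod 16 = 4 bytes.
   For a backward running DR with step -4 the adjustment first adds
   (4 - 1) * -4 = -12, giving (4 - 12) mod 16 = 8 bytes.  */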
853 /* Function vect_update_misalignment_for_peel
855 DR - the data reference whose misalignment is to be adjusted.
856 DR_PEEL - the data reference whose misalignment is being made
857 zero in the vector loop by the peel.
858 NPEEL - the number of iterations in the peel loop if the misalignment
859 of DR_PEEL is known at compile time. */
861 static void
862 vect_update_misalignment_for_peel (struct data_reference *dr,
863 struct data_reference *dr_peel, int npeel)
865 unsigned int i;
866 vec<dr_p> same_align_drs;
867 struct data_reference *current_dr;
868 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
869 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
870 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
871 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
873 /* For interleaved data accesses the step in the loop must be multiplied by
874 the size of the interleaving group. */
875 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
876 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
877 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
878 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
880 /* It can be assumed that the data refs with the same alignment as dr_peel
881 are aligned in the vector loop. */
882 same_align_drs
883 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
884 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
886 if (current_dr != dr)
887 continue;
888 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
889 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
890 SET_DR_MISALIGNMENT (dr, 0);
891 return;
894 if (known_alignment_for_access_p (dr)
895 && known_alignment_for_access_p (dr_peel))
897 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
898 int misal = DR_MISALIGNMENT (dr);
899 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
900 misal += negative ? -npeel * dr_size : npeel * dr_size;
901 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
902 SET_DR_MISALIGNMENT (dr, misal);
903 return;
906 if (dump_enabled_p ())
907 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
908 SET_DR_MISALIGNMENT (dr, -1);
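/* A worked example: suppose DR_PEEL has misalignment 8 with 4-byte
   elements and a 16-byte vectype, so NPEEL = 4 - 8/4 = 2 iterations.
   A DR with the same element size, a positive step and misalignment 4
   then becomes (4 + 2 * 4) & 15 = 12, while a DR whose misalignment is
   unknown is simply reset to -1.  */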
912 /* Function verify_data_ref_alignment
914 Return TRUE if DR can be handled with respect to alignment. */
916 static bool
917 verify_data_ref_alignment (data_reference_p dr)
919 enum dr_alignment_support supportable_dr_alignment
920 = vect_supportable_dr_alignment (dr, false);
921 if (!supportable_dr_alignment)
923 if (dump_enabled_p ())
925 if (DR_IS_READ (dr))
926 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
927 "not vectorized: unsupported unaligned load.");
928 else
929 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
930 "not vectorized: unsupported unaligned "
931 "store.");
933 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
934 DR_REF (dr));
935 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
937 return false;
940 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
941 dump_printf_loc (MSG_NOTE, vect_location,
942 "Vectorizing an unaligned access.\n");
944 return true;
947 /* Function vect_verify_datarefs_alignment
949 Return TRUE if all data references in the loop can be
950 handled with respect to alignment. */
952 bool
953 vect_verify_datarefs_alignment (loop_vec_info vinfo)
955 vec<data_reference_p> datarefs = vinfo->datarefs;
956 struct data_reference *dr;
957 unsigned int i;
959 FOR_EACH_VEC_ELT (datarefs, i, dr)
961 gimple *stmt = DR_STMT (dr);
962 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
964 if (!STMT_VINFO_RELEVANT_P (stmt_info))
965 continue;
967 /* For interleaving, only the alignment of the first access matters. */
968 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
969 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
970 continue;
972 /* Strided accesses perform only component accesses, alignment is
973 irrelevant for them. */
974 if (STMT_VINFO_STRIDED_P (stmt_info)
975 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
976 continue;
978 if (! verify_data_ref_alignment (dr))
979 return false;
982 return true;
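/* Note that for an interleaved group such as

     a[2*i]   = x;   /- GROUP_FIRST_ELEMENT
     a[2*i+1] = y;

   only the first access is checked above; the remaining group members
   are accessed relative to it, so their alignment follows from the
   group leader and the group layout.  */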
985 /* Given a memory reference EXP, return whether its alignment is less
986 than its size. */
988 static bool
989 not_size_aligned (tree exp)
991 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
992 return true;
994 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
995 > get_object_alignment (exp));
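/* For example, for an access to S.I below

     struct __attribute__((packed)) S { char c; int i; } s;

   TYPE_SIZE is 32 bits but get_object_alignment only guarantees 8 bits,
   so not_size_aligned returns true; for an ordinary, naturally aligned
   int object it returns false.  */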
998 /* Function vector_alignment_reachable_p
1000 Return true if vector alignment for DR is reachable by peeling
1001 a few loop iterations. Return false otherwise. */
1003 static bool
1004 vector_alignment_reachable_p (struct data_reference *dr)
1006 gimple *stmt = DR_STMT (dr);
1007 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1008 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1010 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1012 /* For interleaved accesses we peel only if the number of iterations in
1013 the prologue loop (VF - misalignment, counted in elements) is a
1014 multiple of the number of interleaved accesses (the group size). */
1015 int elem_size, mis_in_elements;
1016 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1018 /* FORNOW: handle only known alignment. */
1019 if (!known_alignment_for_access_p (dr))
1020 return false;
1022 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1023 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1025 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1026 return false;
1029 /* If the misalignment is known at compile time then allow peeling
1030 only if natural alignment is reachable through peeling. */
1031 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1033 HOST_WIDE_INT elmsize =
1034 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1035 if (dump_enabled_p ())
1037 dump_printf_loc (MSG_NOTE, vect_location,
1038 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1039 dump_printf (MSG_NOTE,
1040 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1042 if (DR_MISALIGNMENT (dr) % elmsize)
1044 if (dump_enabled_p ())
1045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1046 "data size does not divide the misalignment.\n");
1047 return false;
1051 if (!known_alignment_for_access_p (dr))
1053 tree type = TREE_TYPE (DR_REF (dr));
1054 bool is_packed = not_size_aligned (DR_REF (dr));
1055 if (dump_enabled_p ())
1056 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1057 "Unknown misalignment, is_packed = %d\n",is_packed);
1058 if ((TYPE_USER_ALIGN (type) && !is_packed)
1059 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1060 return true;
1061 else
1062 return false;
1065 return true;
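/* Two small examples of the checks above, assuming a V4SI vectype with
   4-byte elements: a known misalignment of 8 bytes is 2 elements, so
   peeling 4 - 2 = 2 iterations reaches alignment; a known misalignment
   of 2 bytes is not a multiple of the element size, so no number of
   whole-iteration peels can align the access and we return false.  */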
1069 /* Calculate the cost of the memory access represented by DR. */
1071 static void
1072 vect_get_data_access_cost (struct data_reference *dr,
1073 unsigned int *inside_cost,
1074 unsigned int *outside_cost,
1075 stmt_vector_for_cost *body_cost_vec)
1077 gimple *stmt = DR_STMT (dr);
1078 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1079 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1080 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1081 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1082 int ncopies = vf / nunits;
1084 if (DR_IS_READ (dr))
1085 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1086 NULL, body_cost_vec, false);
1087 else
1088 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1090 if (dump_enabled_p ())
1091 dump_printf_loc (MSG_NOTE, vect_location,
1092 "vect_get_data_access_cost: inside_cost = %d, "
1093 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1097 typedef struct _vect_peel_info
1099 int npeel;
1100 struct data_reference *dr;
1101 unsigned int count;
1102 } *vect_peel_info;
1104 typedef struct _vect_peel_extended_info
1106 struct _vect_peel_info peel_info;
1107 unsigned int inside_cost;
1108 unsigned int outside_cost;
1109 stmt_vector_for_cost body_cost_vec;
1110 } *vect_peel_extended_info;
1113 /* Peeling hashtable helpers. */
1115 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1117 static inline hashval_t hash (const _vect_peel_info *);
1118 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1121 inline hashval_t
1122 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1124 return (hashval_t) peel_info->npeel;
1127 inline bool
1128 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1130 return (a->npeel == b->npeel);
1134 /* Insert DR into peeling hash table with NPEEL as key. */
1136 static void
1137 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1138 loop_vec_info loop_vinfo, struct data_reference *dr,
1139 int npeel)
1141 struct _vect_peel_info elem, *slot;
1142 _vect_peel_info **new_slot;
1143 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1145 elem.npeel = npeel;
1146 slot = peeling_htab->find (&elem);
1147 if (slot)
1148 slot->count++;
1149 else
1151 slot = XNEW (struct _vect_peel_info);
1152 slot->npeel = npeel;
1153 slot->dr = dr;
1154 slot->count = 1;
1155 new_slot = peeling_htab->find_slot (slot, INSERT);
1156 *new_slot = slot;
1159 if (!supportable_dr_alignment
1160 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1161 slot->count += VECT_MAX_COST;
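/* For example, three data-refs that each become aligned after peeling 2
   iterations and one that needs 5 produce two table entries,
   {npeel = 2, count = 3} and {npeel = 5, count = 1}.  With the
   unlimited cost model, a data-ref whose unaligned access is not
   supported additionally gets VECT_MAX_COST added to its entry's count,
   which strongly biases the most-frequent heuristic towards aligning
   it.  */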
1165 /* Traverse the peeling hash table to find the peeling option that aligns
1166 the maximum number of data accesses. */
1169 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1170 _vect_peel_extended_info *max)
1172 vect_peel_info elem = *slot;
1174 if (elem->count > max->peel_info.count
1175 || (elem->count == max->peel_info.count
1176 && max->peel_info.npeel > elem->npeel))
1178 max->peel_info.npeel = elem->npeel;
1179 max->peel_info.count = elem->count;
1180 max->peel_info.dr = elem->dr;
1183 return 1;
1187 /* Traverse the peeling hash table and calculate the cost of each peeling
1188 option. Find the one with the lowest cost. */
1191 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1192 _vect_peel_extended_info *min)
1194 vect_peel_info elem = *slot;
1195 int save_misalignment, dummy;
1196 unsigned int inside_cost = 0, outside_cost = 0, i;
1197 gimple *stmt = DR_STMT (elem->dr);
1198 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1199 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1200 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1201 struct data_reference *dr;
1202 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1204 prologue_cost_vec.create (2);
1205 body_cost_vec.create (2);
1206 epilogue_cost_vec.create (2);
1208 FOR_EACH_VEC_ELT (datarefs, i, dr)
1210 stmt = DR_STMT (dr);
1211 stmt_info = vinfo_for_stmt (stmt);
1212 /* For interleaving, only the alignment of the first access
1213 matters. */
1214 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1215 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1216 continue;
1218 /* Strided accesses perform only component accesses, alignment is
1219 irrelevant for them. */
1220 if (STMT_VINFO_STRIDED_P (stmt_info)
1221 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1222 continue;
1224 save_misalignment = DR_MISALIGNMENT (dr);
1225 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1226 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1227 &body_cost_vec);
1228 SET_DR_MISALIGNMENT (dr, save_misalignment);
1231 outside_cost += vect_get_known_peeling_cost
1232 (loop_vinfo, elem->npeel, &dummy,
1233 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1234 &prologue_cost_vec, &epilogue_cost_vec);
1236 /* Prologue and epilogue costs are added to the target model later.
1237 These costs depend only on the scalar iteration cost, the
1238 number of peeling iterations finally chosen, and the number of
1239 misaligned statements. So discard the information found here. */
1240 prologue_cost_vec.release ();
1241 epilogue_cost_vec.release ();
1243 if (inside_cost < min->inside_cost
1244 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1246 min->inside_cost = inside_cost;
1247 min->outside_cost = outside_cost;
1248 min->body_cost_vec.release ();
1249 min->body_cost_vec = body_cost_vec;
1250 min->peel_info.dr = elem->dr;
1251 min->peel_info.npeel = elem->npeel;
1253 else
1254 body_cost_vec.release ();
1256 return 1;
1260 /* Choose the best peeling option by traversing the peeling hash table and
1261 either choosing the option with the lowest cost (if the cost model is
1262 enabled) or the option that aligns as many accesses as possible. */
1264 static struct data_reference *
1265 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1266 loop_vec_info loop_vinfo,
1267 unsigned int *npeel,
1268 stmt_vector_for_cost *body_cost_vec)
1270 struct _vect_peel_extended_info res;
1272 res.peel_info.dr = NULL;
1273 res.body_cost_vec = stmt_vector_for_cost ();
1275 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1277 res.inside_cost = INT_MAX;
1278 res.outside_cost = INT_MAX;
1279 peeling_htab->traverse <_vect_peel_extended_info *,
1280 vect_peeling_hash_get_lowest_cost> (&res);
1282 else
1284 res.peel_info.count = 0;
1285 peeling_htab->traverse <_vect_peel_extended_info *,
1286 vect_peeling_hash_get_most_frequent> (&res);
1289 *npeel = res.peel_info.npeel;
1290 *body_cost_vec = res.body_cost_vec;
1291 return res.peel_info.dr;
1295 /* Function vect_enhance_data_refs_alignment
1297 This pass will use loop versioning and loop peeling in order to enhance
1298 the alignment of data references in the loop.
1300 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1301 original loop is to be vectorized. Any other loops that are created by
1302 the transformations performed in this pass are not supposed to be
1303 vectorized. This restriction will be relaxed.
1305 This pass will require a cost model to guide it whether to apply peeling
1306 or versioning or a combination of the two. For example, the scheme that
1307 Intel uses when given a loop with several memory accesses is as follows:
1308 choose one memory access ('p') whose alignment you want to force by doing
1309 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1310 other accesses are not necessarily aligned, or (2) use loop versioning to
1311 generate one loop in which all accesses are aligned, and another loop in
1312 which only 'p' is necessarily aligned.
1314 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1315 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1316 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1318 Devising a cost model is the most critical aspect of this work. It will
1319 guide us on which access to peel for, whether to use loop versioning, how
1320 many versions to create, etc. The cost model will probably consist of
1321 generic considerations as well as target specific considerations (on
1322 powerpc for example, misaligned stores are more painful than misaligned
1323 loads).
1325 Here are the general steps involved in alignment enhancements:
1327 -- original loop, before alignment analysis:
1328 for (i=0; i<N; i++){
1329 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1330 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1333 -- After vect_compute_data_refs_alignment:
1334 for (i=0; i<N; i++){
1335 x = q[i]; # DR_MISALIGNMENT(q) = 3
1336 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1339 -- Possibility 1: we do loop versioning:
1340 if (p is aligned) {
1341 for (i=0; i<N; i++){ # loop 1A
1342 x = q[i]; # DR_MISALIGNMENT(q) = 3
1343 p[i] = y; # DR_MISALIGNMENT(p) = 0
1346 else {
1347 for (i=0; i<N; i++){ # loop 1B
1348 x = q[i]; # DR_MISALIGNMENT(q) = 3
1349 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1353 -- Possibility 2: we do loop peeling:
1354 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1355 x = q[i];
1356 p[i] = y;
1358 for (i = 3; i < N; i++){ # loop 2A
1359 x = q[i]; # DR_MISALIGNMENT(q) = 0
1360 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1363 -- Possibility 3: combination of loop peeling and versioning:
1364 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1365 x = q[i];
1366 p[i] = y;
1368 if (p is aligned) {
1369 for (i = 3; i<N; i++){ # loop 3A
1370 x = q[i]; # DR_MISALIGNMENT(q) = 0
1371 p[i] = y; # DR_MISALIGNMENT(p) = 0
1374 else {
1375 for (i = 3; i<N; i++){ # loop 3B
1376 x = q[i]; # DR_MISALIGNMENT(q) = 0
1377 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1381 These loops are later passed to loop_transform to be vectorized. The
1382 vectorizer will use the alignment information to guide the transformation
1383 (whether to generate regular loads/stores, or with special handling for
1384 misalignment). */
1386 bool
1387 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1389 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1390 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1391 enum dr_alignment_support supportable_dr_alignment;
1392 struct data_reference *dr0 = NULL, *first_store = NULL;
1393 struct data_reference *dr;
1394 unsigned int i, j;
1395 bool do_peeling = false;
1396 bool do_versioning = false;
1397 bool stat;
1398 gimple *stmt;
1399 stmt_vec_info stmt_info;
1400 unsigned int npeel = 0;
1401 bool all_misalignments_unknown = true;
1402 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1403 unsigned possible_npeel_number = 1;
1404 tree vectype;
1405 unsigned int nelements, mis, same_align_drs_max = 0;
1406 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1407 hash_table<peel_info_hasher> peeling_htab (1);
1409 if (dump_enabled_p ())
1410 dump_printf_loc (MSG_NOTE, vect_location,
1411 "=== vect_enhance_data_refs_alignment ===\n");
1413 /* Reset data so we can safely be called multiple times. */
1414 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1415 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1417 /* While cost model enhancements are expected in the future, the high level
1418 view of the code at this time is as follows:
1420 A) If there is a misaligned access then see if peeling to align
1421 this access can make all data references satisfy
1422 vect_supportable_dr_alignment. If so, update data structures
1423 as needed and return true.
1425 B) If peeling wasn't possible and there is a data reference with an
1426 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1427 then see if loop versioning checks can be used to make all data
1428 references satisfy vect_supportable_dr_alignment. If so, update
1429 data structures as needed and return true.
1431 C) If neither peeling nor versioning were successful then return false if
1432 any data reference does not satisfy vect_supportable_dr_alignment.
1434 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1436 Note, Possibility 3 above (which is peeling and versioning together) is not
1437 being done at this time. */
1439 /* (1) Peeling to force alignment. */
1441 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1442 Considerations:
1443 + How many accesses will become aligned due to the peeling
1444 - How many accesses will become unaligned due to the peeling,
1445 and the cost of misaligned accesses.
1446 - The cost of peeling (the extra runtime checks, the increase
1447 in code size). */
1449 FOR_EACH_VEC_ELT (datarefs, i, dr)
1451 stmt = DR_STMT (dr);
1452 stmt_info = vinfo_for_stmt (stmt);
1454 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1455 continue;
1457 /* For interleaving, only the alignment of the first access
1458 matters. */
1459 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1460 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1461 continue;
1463 /* For invariant accesses there is nothing to enhance. */
1464 if (integer_zerop (DR_STEP (dr)))
1465 continue;
1467 /* Strided accesses perform only component accesses, alignment is
1468 irrelevant for them. */
1469 if (STMT_VINFO_STRIDED_P (stmt_info)
1470 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1471 continue;
1473 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1474 do_peeling = vector_alignment_reachable_p (dr);
1475 if (do_peeling)
1477 if (known_alignment_for_access_p (dr))
1479 unsigned int npeel_tmp;
1480 bool negative = tree_int_cst_compare (DR_STEP (dr),
1481 size_zero_node) < 0;
1483 /* Save info about DR in the hash table. */
1484 vectype = STMT_VINFO_VECTYPE (stmt_info);
1485 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1486 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1487 TREE_TYPE (DR_REF (dr))));
1488 npeel_tmp = (negative
1489 ? (mis - nelements) : (nelements - mis))
1490 & (nelements - 1);
1492 /* For multiple types, it is possible that the bigger type access
1493 will have more than one peeling option. E.g., a loop with two
1494 types: one of size (vector size / 4), and the other one of
1495 size (vector size / 8). The vectorization factor will be 8. If both
1496 accesses are misaligned by 3, the first one needs one scalar
1497 iteration to be aligned, and the second one needs 5. But the
1498 first one will also be aligned by peeling 5 scalar
1499 iterations, and in that case both accesses will be aligned.
1500 Hence, besides the immediate peeling amount, we also want
1501 to try adding a full vector size, as long as we don't exceed
1502 the vectorization factor.
1503 We do this automatically for the cost model, since we calculate
1504 the cost for every peeling option. */
1505 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1507 if (STMT_SLP_TYPE (stmt_info))
1508 possible_npeel_number
1509 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1510 else
1511 possible_npeel_number = vf / nelements;
1514 /* Handle the aligned case. We may decide to align some other
1515 access, making DR unaligned. */
1516 if (DR_MISALIGNMENT (dr) == 0)
1518 npeel_tmp = 0;
1519 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1520 possible_npeel_number++;
1523 for (j = 0; j < possible_npeel_number; j++)
1525 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1526 dr, npeel_tmp);
1527 npeel_tmp += nelements;
1530 all_misalignments_unknown = false;
1531 /* Data-ref that was chosen for the case that all the
1532 misalignments are unknown is not relevant anymore, since we
1533 have a data-ref with known alignment. */
1534 dr0 = NULL;
1536 else
1538 /* If we don't know any misalignment values, we prefer
1539 peeling for the data-ref that has the maximum number of data-refs
1540 with the same alignment, unless the target prefers to align
1541 stores over loads. */
1542 if (all_misalignments_unknown)
1544 unsigned same_align_drs
1545 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1546 if (!dr0
1547 || same_align_drs_max < same_align_drs)
1549 same_align_drs_max = same_align_drs;
1550 dr0 = dr;
1552 /* For data-refs with the same number of related
1553 accesses prefer the one where the misalign
1554 computation will be invariant in the outermost loop. */
1555 else if (same_align_drs_max == same_align_drs)
1557 struct loop *ivloop0, *ivloop;
1558 ivloop0 = outermost_invariant_loop_for_expr
1559 (loop, DR_BASE_ADDRESS (dr0));
1560 ivloop = outermost_invariant_loop_for_expr
1561 (loop, DR_BASE_ADDRESS (dr));
1562 if ((ivloop && !ivloop0)
1563 || (ivloop && ivloop0
1564 && flow_loop_nested_p (ivloop, ivloop0)))
1565 dr0 = dr;
1568 if (!first_store && DR_IS_WRITE (dr))
1569 first_store = dr;
1572 /* If there are both known and unknown misaligned accesses in the
1573 loop, we choose peeling amount according to the known
1574 accesses. */
1575 if (!supportable_dr_alignment)
1577 dr0 = dr;
1578 if (!first_store && DR_IS_WRITE (dr))
1579 first_store = dr;
1583 else
1585 if (!aligned_access_p (dr))
1587 if (dump_enabled_p ())
1588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1589 "vector alignment may not be reachable\n");
1590 break;
1595 /* Check if we can possibly peel the loop. */
1596 if (!vect_can_advance_ivs_p (loop_vinfo)
1597 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1598 || loop->inner)
1599 do_peeling = false;
1601 if (do_peeling
1602 && all_misalignments_unknown
1603 && vect_supportable_dr_alignment (dr0, false))
1605 /* Check if the target prefers stores to be aligned over loads, i.e., if
1606 misaligned stores are more expensive than misaligned loads (taking
1607 drs with same alignment into account). */
1608 if (first_store && DR_IS_READ (dr0))
1610 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1611 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1612 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1613 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1614 stmt_vector_for_cost dummy;
1615 dummy.create (2);
1617 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1618 &dummy);
1619 vect_get_data_access_cost (first_store, &store_inside_cost,
1620 &store_outside_cost, &dummy);
1622 dummy.release ();
1624 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1625 aligning the load DR0). */
1626 load_inside_penalty = store_inside_cost;
1627 load_outside_penalty = store_outside_cost;
1628 for (i = 0;
1629 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1630 DR_STMT (first_store))).iterate (i, &dr);
1631 i++)
1632 if (DR_IS_READ (dr))
1634 load_inside_penalty += load_inside_cost;
1635 load_outside_penalty += load_outside_cost;
1637 else
1639 load_inside_penalty += store_inside_cost;
1640 load_outside_penalty += store_outside_cost;
1643 /* Calculate the penalty for leaving DR0 unaligned (by
1644 aligning the FIRST_STORE). */
1645 store_inside_penalty = load_inside_cost;
1646 store_outside_penalty = load_outside_cost;
1647 for (i = 0;
1648 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1649 DR_STMT (dr0))).iterate (i, &dr);
1650 i++)
1651 if (DR_IS_READ (dr))
1653 store_inside_penalty += load_inside_cost;
1654 store_outside_penalty += load_outside_cost;
1656 else
1658 store_inside_penalty += store_inside_cost;
1659 store_outside_penalty += store_outside_cost;
1662 if (load_inside_penalty > store_inside_penalty
1663 || (load_inside_penalty == store_inside_penalty
1664 && load_outside_penalty > store_outside_penalty))
1665 dr0 = first_store;
1668 /* In case there are only loads with different unknown misalignments, use
1669 peeling only if it may help to align other accesses in the loop or
1670 if it may help improve load bandwidth when we'd end up using
1671 unaligned loads. */
1672 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1673 if (!first_store
1674 && !STMT_VINFO_SAME_ALIGN_REFS (
1675 vinfo_for_stmt (DR_STMT (dr0))).length ()
1676 && (vect_supportable_dr_alignment (dr0, false)
1677 != dr_unaligned_supported
1678 || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1679 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1680 do_peeling = false;
1683 if (do_peeling && !dr0)
1685 /* Peeling is possible, but no data access strictly requires alignment
1686 in order to be supported. So we try to choose the best possible peeling. */
1688 /* We should get here only if there are drs with known misalignment. */
1689 gcc_assert (!all_misalignments_unknown);
1691 /* Choose the best peeling from the hash table. */
1692 dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
1693 loop_vinfo, &npeel,
1694 &body_cost_vec);
1695 if (!dr0 || !npeel)
1696 do_peeling = false;
1699 if (do_peeling)
1701 stmt = DR_STMT (dr0);
1702 stmt_info = vinfo_for_stmt (stmt);
1703 vectype = STMT_VINFO_VECTYPE (stmt_info);
1704 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1706 if (known_alignment_for_access_p (dr0))
1708 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1709 size_zero_node) < 0;
1710 if (!npeel)
1712 /* Since it's known at compile time, compute the number of
1713 iterations in the peeled loop (the peeling factor) for use in
1714 updating DR_MISALIGNMENT values. The peeling factor is the
1715 vectorization factor minus the misalignment as an element
1716 count. */
1717 mis = DR_MISALIGNMENT (dr0);
1718 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1719 npeel = ((negative ? mis - nelements : nelements - mis)
1720 & (nelements - 1));
1723 /* For interleaved data access every iteration accesses all the
1724 members of the group, therefore we divide the number of iterations
1725 by the group size. */
1726 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1727 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1728 npeel /= GROUP_SIZE (stmt_info);
1730 if (dump_enabled_p ())
1731 dump_printf_loc (MSG_NOTE, vect_location,
1732 "Try peeling by %d\n", npeel);
1735 /* Ensure that all data refs can be vectorized after the peel. */
1736 FOR_EACH_VEC_ELT (datarefs, i, dr)
1738 int save_misalignment;
1740 if (dr == dr0)
1741 continue;
1743 stmt = DR_STMT (dr);
1744 stmt_info = vinfo_for_stmt (stmt);
1745 /* For interleaving, only the alignment of the first access
1746 matters. */
1747 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1748 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1749 continue;
1751 /* Strided accesses perform only component accesses, alignment is
1752 irrelevant for them. */
1753 if (STMT_VINFO_STRIDED_P (stmt_info)
1754 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1755 continue;
1757 save_misalignment = DR_MISALIGNMENT (dr);
1758 vect_update_misalignment_for_peel (dr, dr0, npeel);
1759 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1760 SET_DR_MISALIGNMENT (dr, save_misalignment);
1762 if (!supportable_dr_alignment)
1764 do_peeling = false;
1765 break;
1769 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1771 stat = vect_verify_datarefs_alignment (loop_vinfo);
1772 if (!stat)
1773 do_peeling = false;
1774 else
1776 body_cost_vec.release ();
1777 return stat;
1781 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1782 if (do_peeling)
1784 unsigned max_allowed_peel
1785 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1786 if (max_allowed_peel != (unsigned)-1)
1788 unsigned max_peel = npeel;
1789 if (max_peel == 0)
1791 gimple *dr_stmt = DR_STMT (dr0);
1792 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1793 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1794 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1796 if (max_peel > max_allowed_peel)
1798 do_peeling = false;
1799 if (dump_enabled_p ())
1800 dump_printf_loc (MSG_NOTE, vect_location,
1801 "Disable peeling, max peels reached: %d\n", max_peel);
1806 /* Cost model #2 - if peeling may result in a remaining loop not
1807 iterating enough to be vectorized then do not peel. */
1808 if (do_peeling
1809 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1811 unsigned max_peel
1812 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1813 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1814 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1815 do_peeling = false;
1818 if (do_peeling)
1820 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1821 If the misalignment of DR_i is identical to that of dr0 then set
1822 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1823 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1824 by the peeling factor times the element size of DR_i (MOD the
1825 vectorization factor times the size). Otherwise, the
1826 misalignment of DR_i must be set to unknown. */
1827 FOR_EACH_VEC_ELT (datarefs, i, dr)
1828 if (dr != dr0)
1830 /* Strided accesses perform only component accesses, alignment
1831 is irrelevant for them. */
1832 stmt_info = vinfo_for_stmt (DR_STMT (dr));
1833 if (STMT_VINFO_STRIDED_P (stmt_info)
1834 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1835 continue;
1837 vect_update_misalignment_for_peel (dr, dr0, npeel);
1840 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1841 if (npeel)
1842 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1843 else
1844 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1845 = DR_MISALIGNMENT (dr0);
1846 SET_DR_MISALIGNMENT (dr0, 0);
1847 if (dump_enabled_p ())
1849 dump_printf_loc (MSG_NOTE, vect_location,
1850 "Alignment of access forced using peeling.\n");
1851 dump_printf_loc (MSG_NOTE, vect_location,
1852 "Peeling for alignment will be applied.\n");
1854 /* The inside-loop cost will be accounted for in vectorizable_load
1855 and vectorizable_store correctly with adjusted alignments.
1856          Drop the body_cost_vec on the floor here.  */
1857 body_cost_vec.release ();
1859 stat = vect_verify_datarefs_alignment (loop_vinfo);
1860 gcc_assert (stat);
1861 return stat;
1865 body_cost_vec.release ();
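  /* Illustrative sketch of peeling for alignment (hypothetical loop and
     numbers, not part of the implementation): for

       void foo (float *a, int n)   // 'a' is 4 bytes off a 16-byte boundary
       {
         for (int i = 0; i < n; i++)
           a[i] = a[i] * 2.0f;
       }

     with V4SF vectors (nelements == 4) the misalignment is one element,
     so the code above computes npeel = (4 - 1) & 3 = 3 and records it in
     LOOP_VINFO_PEELING_FOR_ALIGNMENT; peeling three scalar iterations
     makes the remaining accesses of 'a' vector aligned.  */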
1867 /* (2) Versioning to force alignment. */
1869 /* Try versioning if:
1870      1) the loop is optimized for speed, and
1871 2) there is at least one unsupported misaligned data ref with an unknown
1872 misalignment, and
1873 3) all misaligned data refs with a known misalignment are supported, and
1874 4) the number of runtime alignment checks is within reason. */
1876 do_versioning =
1877 optimize_loop_nest_for_speed_p (loop)
1878 && (!loop->inner); /* FORNOW */
1880 if (do_versioning)
1882 FOR_EACH_VEC_ELT (datarefs, i, dr)
1884 stmt = DR_STMT (dr);
1885 stmt_info = vinfo_for_stmt (stmt);
1887 /* For interleaving, only the alignment of the first access
1888 matters. */
1889 if (aligned_access_p (dr)
1890 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1891 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1892 continue;
1894 if (STMT_VINFO_STRIDED_P (stmt_info))
1896 /* Strided loads perform only component accesses, alignment is
1897 irrelevant for them. */
1898 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1899 continue;
1900 do_versioning = false;
1901 break;
1904 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1906 if (!supportable_dr_alignment)
1908 gimple *stmt;
1909 int mask;
1910 tree vectype;
1912 if (known_alignment_for_access_p (dr)
1913 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1914 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1916 do_versioning = false;
1917 break;
1920 stmt = DR_STMT (dr);
1921 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1922 gcc_assert (vectype);
1924 /* The rightmost bits of an aligned address must be zeros.
1925 Construct the mask needed for this test. For example,
1926 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1927 mask must be 15 = 0xf. */
1928 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1930 /* FORNOW: use the same mask to test all potentially unaligned
1931 references in the loop. The vectorizer currently supports
1932 a single vector size, see the reference to
1933 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1934 vectorization factor is computed. */
1935 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1936 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1937 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1938 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1939 DR_STMT (dr));
1943 /* Versioning requires at least one misaligned data reference. */
1944 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1945 do_versioning = false;
1946 else if (!do_versioning)
1947 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1950 if (do_versioning)
1952 vec<gimple *> may_misalign_stmts
1953 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1954 gimple *stmt;
1956 /* It can now be assumed that the data references in the statements
1957 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1958 of the loop being vectorized. */
1959 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1961 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1962 dr = STMT_VINFO_DATA_REF (stmt_info);
1963 SET_DR_MISALIGNMENT (dr, 0);
1964 if (dump_enabled_p ())
1965 dump_printf_loc (MSG_NOTE, vect_location,
1966 "Alignment of access forced using versioning.\n");
1969 if (dump_enabled_p ())
1970 dump_printf_loc (MSG_NOTE, vect_location,
1971 "Versioning for alignment will be applied.\n");
1973 /* Peeling and versioning can't be done together at this time. */
1974 gcc_assert (! (do_peeling && do_versioning));
1976 stat = vect_verify_datarefs_alignment (loop_vinfo);
1977 gcc_assert (stat);
1978 return stat;
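  /* Illustrative sketch (hypothetical names): versioning for alignment
     conceptually guards a vectorized copy of the loop with a runtime
     test built from LOOP_VINFO_PTR_MASK, e.g. for 16-byte V4SI vectors
     and two potentially misaligned pointers p and q recorded in
     LOOP_VINFO_MAY_MISALIGN_STMTS:

       if ((((uintptr_t) p | (uintptr_t) q) & 15) == 0)
         ... vectorized loop, both accesses treated as aligned ...
       else
         ... original scalar loop ...

     The guard itself is emitted later by the loop-versioning code; this
     only illustrates why the mask is GET_MODE_SIZE (vector mode) - 1.  */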
1981 /* This point is reached if neither peeling nor versioning is being done. */
1982 gcc_assert (! (do_peeling || do_versioning));
1984 stat = vect_verify_datarefs_alignment (loop_vinfo);
1985 return stat;
1989 /* Function vect_find_same_alignment_drs.
1991 Update group and alignment relations according to the chosen
1992 vectorization factor. */
1994 static void
1995 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1996 loop_vec_info loop_vinfo)
1998 unsigned int i;
1999 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2000 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2001 struct data_reference *dra = DDR_A (ddr);
2002 struct data_reference *drb = DDR_B (ddr);
2003 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2004 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2005 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
2006 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
2007 lambda_vector dist_v;
2008 unsigned int loop_depth;
2010 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2011 return;
2013 if (dra == drb)
2014 return;
2016 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
2017 return;
2019 /* Loop-based vectorization and known data dependence. */
2020 if (DDR_NUM_DIST_VECTS (ddr) == 0)
2021 return;
2023 /* Data-dependence analysis reports a distance vector of zero
2024 for data-references that overlap only in the first iteration
2025      but have steps of different sign (see PR45764).
2026 So as a sanity check require equal DR_STEP. */
2027 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2028 return;
2030 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
2031 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
2033 int dist = dist_v[loop_depth];
2035 if (dump_enabled_p ())
2036 dump_printf_loc (MSG_NOTE, vect_location,
2037 "dependence distance = %d.\n", dist);
2039 /* Same loop iteration. */
2040 if (dist == 0
2041 || (dist % vectorization_factor == 0 && dra_size == drb_size))
2044          /* Two references with distance zero, or a multiple of VF and
             equal element size, have the same alignment.  */
2044 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2045 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2046 if (dump_enabled_p ())
2048 dump_printf_loc (MSG_NOTE, vect_location,
2049 "accesses have the same alignment.\n");
2050 dump_printf (MSG_NOTE,
2051 "dependence distance modulo vf == 0 between ");
2052 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2053 dump_printf (MSG_NOTE, " and ");
2054 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2055 dump_printf (MSG_NOTE, "\n");
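  /* Illustrative sketch (hypothetical loop): with a vectorization factor
     of 4, the two references to 'a' in

       for (i = 0; i < n; i++)
         a[i] = a[i + 4] + 1.0;

     have a dependence distance of 4; 4 % vf == 0 and the element sizes
     match, so each reference is pushed onto the other's
     STMT_VINFO_SAME_ALIGN_REFS list by the code above.  */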
2062 /* Function vect_analyze_data_refs_alignment
2064 Analyze the alignment of the data-references in the loop.
2065 Return FALSE if a data reference is found that cannot be vectorized. */
2067 bool
2068 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2070 if (dump_enabled_p ())
2071 dump_printf_loc (MSG_NOTE, vect_location,
2072 "=== vect_analyze_data_refs_alignment ===\n");
2074 /* Mark groups of data references with same alignment using
2075 data dependence information. */
2076 vec<ddr_p> ddrs = vinfo->ddrs;
2077 struct data_dependence_relation *ddr;
2078 unsigned int i;
2080 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2081 vect_find_same_alignment_drs (ddr, vinfo);
2083 vec<data_reference_p> datarefs = vinfo->datarefs;
2084 struct data_reference *dr;
2086 FOR_EACH_VEC_ELT (datarefs, i, dr)
2088 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2089 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2090 && !vect_compute_data_ref_alignment (dr))
2092 /* Strided accesses perform only component accesses, misalignment
2093 information is irrelevant for them. */
2094 if (STMT_VINFO_STRIDED_P (stmt_info)
2095 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2096 continue;
2098 if (dump_enabled_p ())
2099 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2100 "not vectorized: can't calculate alignment "
2101 "for data ref.\n");
2103 return false;
2107 return true;
2111 /* Analyze alignment of DRs of stmts in NODE. */
2113 static bool
2114 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2116 /* We vectorize from the first scalar stmt in the node unless
2117      the node is permuted, in which case we start from the first
2118 element in the group. */
2119 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2120 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2121 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2122 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2124 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2125 if (! vect_compute_data_ref_alignment (dr)
2126 /* For creating the data-ref pointer we need alignment of the
2127 first element anyway. */
2128 || (dr != first_dr
2129 && ! vect_compute_data_ref_alignment (first_dr))
2130 || ! verify_data_ref_alignment (dr))
2132 if (dump_enabled_p ())
2133 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2134 "not vectorized: bad data alignment in basic "
2135 "block.\n");
2136 return false;
2139 return true;
2142  /* Function vect_slp_analyze_and_verify_instance_alignment
2144 Analyze the alignment of the data-references in the SLP instance.
2145 Return FALSE if a data reference is found that cannot be vectorized. */
2147 bool
2148 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2150 if (dump_enabled_p ())
2151 dump_printf_loc (MSG_NOTE, vect_location,
2152 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2154 slp_tree node;
2155 unsigned i;
2156 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2157 if (! vect_slp_analyze_and_verify_node_alignment (node))
2158 return false;
2160 node = SLP_INSTANCE_TREE (instance);
2161 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2162 && ! vect_slp_analyze_and_verify_node_alignment
2163 (SLP_INSTANCE_TREE (instance)))
2164 return false;
2166 return true;
2170 /* Analyze groups of accesses: check that DR belongs to a group of
2171 accesses of legal size, step, etc. Detect gaps, single element
2172 interleaving, and other special cases. Set grouped access info.
2173 Collect groups of strided stores for further use in SLP analysis.
2174 Worker for vect_analyze_group_access. */
2176 static bool
2177 vect_analyze_group_access_1 (struct data_reference *dr)
2179 tree step = DR_STEP (dr);
2180 tree scalar_type = TREE_TYPE (DR_REF (dr));
2181 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2182 gimple *stmt = DR_STMT (dr);
2183 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2184 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2185 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2186 HOST_WIDE_INT dr_step = -1;
2187 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2188 bool slp_impossible = false;
2190 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2191 size of the interleaving group (including gaps). */
2192 if (tree_fits_shwi_p (step))
2194 dr_step = tree_to_shwi (step);
2195 /* Check that STEP is a multiple of type size. Otherwise there is
2196 a non-element-sized gap at the end of the group which we
2197 cannot represent in GROUP_GAP or GROUP_SIZE.
2198 ??? As we can handle non-constant step fine here we should
2199 simply remove uses of GROUP_GAP between the last and first
2200 element and instead rely on DR_STEP. GROUP_SIZE then would
2201 simply not include that gap. */
2202 if ((dr_step % type_size) != 0)
2204 if (dump_enabled_p ())
2206 dump_printf_loc (MSG_NOTE, vect_location,
2207 "Step ");
2208 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2209 dump_printf (MSG_NOTE,
2210 " is not a multiple of the element size for ");
2211 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2212 dump_printf (MSG_NOTE, "\n");
2214 return false;
2216 groupsize = absu_hwi (dr_step) / type_size;
2218 else
2219 groupsize = 0;
2221    /* A non-consecutive access is possible only if it is part of an interleaving group.  */
2222 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2224        /* Check whether this DR is part of an interleaving group, and is a single
2225 element of the group that is accessed in the loop. */
2227 /* Gaps are supported only for loads. STEP must be a multiple of the type
2228 size. The size of the group must be a power of 2. */
2229 if (DR_IS_READ (dr)
2230 && (dr_step % type_size) == 0
2231 && groupsize > 0
2232 && exact_log2 (groupsize) != -1)
2234 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2235 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2236 if (dump_enabled_p ())
2238 dump_printf_loc (MSG_NOTE, vect_location,
2239 "Detected single element interleaving ");
2240 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2241 dump_printf (MSG_NOTE, " step ");
2242 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2243 dump_printf (MSG_NOTE, "\n");
2246 return true;
2249 if (dump_enabled_p ())
2251 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2252 "not consecutive access ");
2253 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2256 if (bb_vinfo)
2258 /* Mark the statement as unvectorizable. */
2259 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2260 return true;
2263 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2264 STMT_VINFO_STRIDED_P (stmt_info) = true;
2265 return true;
2268 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2270 /* First stmt in the interleaving chain. Check the chain. */
2271 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2272 struct data_reference *data_ref = dr;
2273 unsigned int count = 1;
2274 tree prev_init = DR_INIT (data_ref);
2275 gimple *prev = stmt;
2276 HOST_WIDE_INT diff, gaps = 0;
2278 while (next)
2280          /* Skip identical data-refs.  If two or more stmts share a
2281 data-ref (supported only for loads), we vectorize only the first
2282 stmt, and the rest get their vectorized loads from the first
2283 one. */
2284 if (!tree_int_cst_compare (DR_INIT (data_ref),
2285 DR_INIT (STMT_VINFO_DATA_REF (
2286 vinfo_for_stmt (next)))))
2288 if (DR_IS_WRITE (data_ref))
2290 if (dump_enabled_p ())
2291 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2292 "Two store stmts share the same dr.\n");
2293 return false;
2296 if (dump_enabled_p ())
2297 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2298 "Two or more load stmts share the same dr.\n");
2300              /* For loads, use the same data-ref load.  */
2301 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2303 prev = next;
2304 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2305 continue;
2308 prev = next;
2309 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2311 /* All group members have the same STEP by construction. */
2312 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2314 /* Check that the distance between two accesses is equal to the type
2315 size. Otherwise, we have gaps. */
2316 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2317 - TREE_INT_CST_LOW (prev_init)) / type_size;
2318 if (diff != 1)
2320 /* FORNOW: SLP of accesses with gaps is not supported. */
2321 slp_impossible = true;
2322 if (DR_IS_WRITE (data_ref))
2324 if (dump_enabled_p ())
2325 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2326 "interleaved store with gaps\n");
2327 return false;
2330 gaps += diff - 1;
2333 last_accessed_element += diff;
2335 /* Store the gap from the previous member of the group. If there is no
2336 gap in the access, GROUP_GAP is always 1. */
2337 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2339 prev_init = DR_INIT (data_ref);
2340 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2341 /* Count the number of data-refs in the chain. */
2342 count++;
2345 if (groupsize == 0)
2346 groupsize = count + gaps;
2348 if (groupsize > UINT_MAX)
2350 if (dump_enabled_p ())
2351 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2352 "group is too large\n");
2353 return false;
2356 /* Check that the size of the interleaving is equal to count for stores,
2357 i.e., that there are no gaps. */
2358 if (groupsize != count
2359 && !DR_IS_READ (dr))
2361 if (dump_enabled_p ())
2362 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2363 "interleaved store with gaps\n");
2364 return false;
2367 /* If there is a gap after the last load in the group it is the
2368 difference between the groupsize and the last accessed
2369 element.
2370 When there is no gap, this difference should be 0. */
2371 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2373 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2374 if (dump_enabled_p ())
2376 dump_printf_loc (MSG_NOTE, vect_location,
2377 "Detected interleaving ");
2378 if (DR_IS_READ (dr))
2379 dump_printf (MSG_NOTE, "load ");
2380 else
2381 dump_printf (MSG_NOTE, "store ");
2382 dump_printf (MSG_NOTE, "of size %u starting with ",
2383 (unsigned)groupsize);
2384 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2385 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2386 dump_printf_loc (MSG_NOTE, vect_location,
2387 "There is a gap of %u elements after the group\n",
2388 GROUP_GAP (vinfo_for_stmt (stmt)));
2391 /* SLP: create an SLP data structure for every interleaving group of
2392     stores for further analysis in vect_analyze_slp.  */
2393 if (DR_IS_WRITE (dr) && !slp_impossible)
2395 if (loop_vinfo)
2396 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2397 if (bb_vinfo)
2398 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2402 return true;
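/* Illustrative sketch (hypothetical loops): a lone load such as

     for (i = 0; i < n; i++)
       s += b[2 * i];

   has a step of two elements and no group members yet, so the code
   above records it as single-element interleaving with GROUP_SIZE 2
   (the unread element is a gap, which is allowed for loads).  A lone
   store with the same step is instead handled as a strided access,
   and a chain of stores whose members leave gaps is rejected with
   "interleaved store with gaps".  */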
2405 /* Analyze groups of accesses: check that DR belongs to a group of
2406 accesses of legal size, step, etc. Detect gaps, single element
2407 interleaving, and other special cases. Set grouped access info.
2408 Collect groups of strided stores for further use in SLP analysis. */
2410 static bool
2411 vect_analyze_group_access (struct data_reference *dr)
2413 if (!vect_analyze_group_access_1 (dr))
2415 /* Dissolve the group if present. */
2416 gimple *next;
2417 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2418 while (stmt)
2420 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2421 next = GROUP_NEXT_ELEMENT (vinfo);
2422 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2423 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2424 stmt = next;
2426 return false;
2428 return true;
2431 /* Analyze the access pattern of the data-reference DR.
2432 In case of non-consecutive accesses call vect_analyze_group_access() to
2433 analyze groups of accesses. */
2435 static bool
2436 vect_analyze_data_ref_access (struct data_reference *dr)
2438 tree step = DR_STEP (dr);
2439 tree scalar_type = TREE_TYPE (DR_REF (dr));
2440 gimple *stmt = DR_STMT (dr);
2441 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2442 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2443 struct loop *loop = NULL;
2445 if (loop_vinfo)
2446 loop = LOOP_VINFO_LOOP (loop_vinfo);
2448 if (loop_vinfo && !step)
2450 if (dump_enabled_p ())
2451 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2452 "bad data-ref access in loop\n");
2453 return false;
2456 /* Allow loads with zero step in inner-loop vectorization. */
2457 if (loop_vinfo && integer_zerop (step))
2459 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2460 if (!nested_in_vect_loop_p (loop, stmt))
2461 return DR_IS_READ (dr);
2462 /* Allow references with zero step for outer loops marked
2463 with pragma omp simd only - it guarantees absence of
2464 loop-carried dependencies between inner loop iterations. */
2465 if (!loop->force_vectorize)
2467 if (dump_enabled_p ())
2468 dump_printf_loc (MSG_NOTE, vect_location,
2469 "zero step in inner loop of nest\n");
2470 return false;
2474 if (loop && nested_in_vect_loop_p (loop, stmt))
2476 /* Interleaved accesses are not yet supported within outer-loop
2477 vectorization for references in the inner-loop. */
2478 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2480 /* For the rest of the analysis we use the outer-loop step. */
2481 step = STMT_VINFO_DR_STEP (stmt_info);
2482 if (integer_zerop (step))
2484 if (dump_enabled_p ())
2485 dump_printf_loc (MSG_NOTE, vect_location,
2486 "zero step in outer loop.\n");
2487 return DR_IS_READ (dr);
2491 /* Consecutive? */
2492 if (TREE_CODE (step) == INTEGER_CST)
2494 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2495 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2496 || (dr_step < 0
2497 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2499 /* Mark that it is not interleaving. */
2500 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2501 return true;
2505 if (loop && nested_in_vect_loop_p (loop, stmt))
2507 if (dump_enabled_p ())
2508 dump_printf_loc (MSG_NOTE, vect_location,
2509 "grouped access in outer loop.\n");
2510 return false;
2514    /* Assume this is a DR handled by the non-constant strided load case.  */
2515 if (TREE_CODE (step) != INTEGER_CST)
2516 return (STMT_VINFO_STRIDED_P (stmt_info)
2517 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2518 || vect_analyze_group_access (dr)));
2520    /* Not a consecutive access - check whether it is part of an interleaving group.  */
2521 return vect_analyze_group_access (dr);
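/* Illustrative sketch of the classification above (hypothetical loop
   and names):

     for (i = 0; i < n; i++)
       {
         x += a[i];       // DR_STEP == element size: consecutive
         y += b[i * k];   // non-constant DR_STEP: strided access
         c[3 * i] = x;    // constant non-unit step: group analysis
       }

   A load whose address does not change in the loop (zero DR_STEP) is
   accepted as an invariant load; each case above maps onto one of the
   branches of vect_analyze_data_ref_access.  */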
2526 /* A helper function used in the comparator function to sort data
2527     references.  T1 and T2 are two trees to be compared.
2528 The function returns -1, 0, or 1. */
2530 static int
2531 compare_tree (tree t1, tree t2)
2533 int i, cmp;
2534 enum tree_code code;
2535 char tclass;
2537 if (t1 == t2)
2538 return 0;
2539 if (t1 == NULL)
2540 return -1;
2541 if (t2 == NULL)
2542 return 1;
2544 STRIP_NOPS (t1);
2545 STRIP_NOPS (t2);
2547 if (TREE_CODE (t1) != TREE_CODE (t2))
2548 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2550 code = TREE_CODE (t1);
2551 switch (code)
2553 /* For const values, we can just use hash values for comparisons. */
2554 case INTEGER_CST:
2555 case REAL_CST:
2556 case FIXED_CST:
2557 case STRING_CST:
2558 case COMPLEX_CST:
2559 case VECTOR_CST:
2561 hashval_t h1 = iterative_hash_expr (t1, 0);
2562 hashval_t h2 = iterative_hash_expr (t2, 0);
2563 if (h1 != h2)
2564 return h1 < h2 ? -1 : 1;
2565 break;
2568 case SSA_NAME:
2569 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2570 if (cmp != 0)
2571 return cmp;
2573 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2574 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2575 break;
2577 default:
2578 tclass = TREE_CODE_CLASS (code);
2580 /* For var-decl, we could compare their UIDs. */
2581 if (tclass == tcc_declaration)
2583 if (DECL_UID (t1) != DECL_UID (t2))
2584 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2585 break;
2588 /* For expressions with operands, compare their operands recursively. */
2589 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2591 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2592 if (cmp != 0)
2593 return cmp;
2597 return 0;
2601  /* Compare two data-references DRA and DRB to sort them into chunks
2602 suitable for grouping. */
2604 static int
2605 dr_group_sort_cmp (const void *dra_, const void *drb_)
2607 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2608 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2609 int cmp;
2611 /* Stabilize sort. */
2612 if (dra == drb)
2613 return 0;
2615 /* DRs in different loops never belong to the same group. */
2616 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2617 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2618 if (loopa != loopb)
2619 return loopa->num < loopb->num ? -1 : 1;
2621 /* Ordering of DRs according to base. */
2622 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2624 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2625 if (cmp != 0)
2626 return cmp;
2629 /* And according to DR_OFFSET. */
2630 if (!dr_equal_offsets_p (dra, drb))
2632 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2633 if (cmp != 0)
2634 return cmp;
2637 /* Put reads before writes. */
2638 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2639 return DR_IS_READ (dra) ? -1 : 1;
2641    /* Then sort by access size.  */
2642 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2643 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2645 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2646 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2647 if (cmp != 0)
2648 return cmp;
2651    /* And by step.  */
2652 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2654 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2655 if (cmp != 0)
2656 return cmp;
2659    /* Then sort by DR_INIT.  In case of identical DRs, sort by stmt UID.  */
2660 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2661 if (cmp == 0)
2662 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2663 return cmp;
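/* Worked example of the ordering (hypothetical refs): a load from and
   a store to b[i] agree on loop, base address, offset, size and step,
   so the read-before-write rule places the load first; two stores to
   p[i].lo and p[i].hi differ only in DR_INIT (0 vs. 4 for 4-byte
   fields) and are ordered by it, which makes candidates for one
   interleaving group adjacent in the sorted vector.  */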
2666 /* Function vect_analyze_data_ref_accesses.
2668 Analyze the access pattern of all the data references in the loop.
2670 FORNOW: the only access pattern that is considered vectorizable is a
2671 simple step 1 (consecutive) access.
2673 FORNOW: handle only arrays and pointer accesses. */
2675 bool
2676 vect_analyze_data_ref_accesses (vec_info *vinfo)
2678 unsigned int i;
2679 vec<data_reference_p> datarefs = vinfo->datarefs;
2680 struct data_reference *dr;
2682 if (dump_enabled_p ())
2683 dump_printf_loc (MSG_NOTE, vect_location,
2684 "=== vect_analyze_data_ref_accesses ===\n");
2686 if (datarefs.is_empty ())
2687 return true;
2689 /* Sort the array of datarefs to make building the interleaving chains
2690     linear.  Don't modify the original vector's order; it is needed for
2691 determining what dependencies are reversed. */
2692 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2693 datarefs_copy.qsort (dr_group_sort_cmp);
2695 /* Build the interleaving chains. */
2696 for (i = 0; i < datarefs_copy.length () - 1;)
2698 data_reference_p dra = datarefs_copy[i];
2699 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2700 stmt_vec_info lastinfo = NULL;
2701 for (i = i + 1; i < datarefs_copy.length (); ++i)
2703 data_reference_p drb = datarefs_copy[i];
2704 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2706 /* ??? Imperfect sorting (non-compatible types, non-modulo
2707             accesses, same accesses) can cause a group to be artificially
2708 split here as we don't just skip over those. If it really
2709 matters we can push those to a worklist and re-iterate
2710             over them.  Then we can just skip ahead to the next DR here.  */
2712 /* DRs in a different loop should not be put into the same
2713 interleaving group. */
2714 if (gimple_bb (DR_STMT (dra))->loop_father
2715 != gimple_bb (DR_STMT (drb))->loop_father)
2716 break;
2718 /* Check that the data-refs have same first location (except init)
2719 and they are both either store or load (not load and store,
2720 not masked loads or stores). */
2721 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2722 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2723 DR_BASE_ADDRESS (drb), 0)
2724 || !dr_equal_offsets_p (dra, drb)
2725 || !gimple_assign_single_p (DR_STMT (dra))
2726 || !gimple_assign_single_p (DR_STMT (drb)))
2727 break;
2729 /* Check that the data-refs have the same constant size. */
2730 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2731 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2732 if (!tree_fits_uhwi_p (sza)
2733 || !tree_fits_uhwi_p (szb)
2734 || !tree_int_cst_equal (sza, szb))
2735 break;
2737 /* Check that the data-refs have the same step. */
2738 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2739 break;
2741 /* Do not place the same access in the interleaving chain twice. */
2742 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2743 break;
2745 /* Check the types are compatible.
2746 ??? We don't distinguish this during sorting. */
2747 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2748 TREE_TYPE (DR_REF (drb))))
2749 break;
2751 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2752 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2753 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2754 gcc_assert (init_a < init_b);
2756 /* If init_b == init_a + the size of the type * k, we have an
2757 interleaving, and DRA is accessed before DRB. */
2758 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2759 if (type_size_a == 0
2760 || (init_b - init_a) % type_size_a != 0)
2761 break;
2763          /* For stores, require the accesses to be adjacent.  This splits
2764 groups into chunks we support (we don't support vectorization
2765 of stores with gaps). */
2766 if (!DR_IS_READ (dra)
2767 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2768 (DR_INIT (datarefs_copy[i-1]))
2769 != type_size_a))
2770 break;
2772          /* If the step (when it is a non-zero constant) is not greater than
2773             the difference between the data-refs' inits, split the group here;
2774             this keeps groups to suitable sizes.  */
2775 if (tree_fits_shwi_p (DR_STEP (dra)))
2777 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2778 if (step != 0 && step <= (init_b - init_a))
2779 break;
2782 if (dump_enabled_p ())
2784 dump_printf_loc (MSG_NOTE, vect_location,
2785 "Detected interleaving ");
2786 if (DR_IS_READ (dra))
2787 dump_printf (MSG_NOTE, "load ");
2788 else
2789 dump_printf (MSG_NOTE, "store ");
2790 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2791 dump_printf (MSG_NOTE, " and ");
2792 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2793 dump_printf (MSG_NOTE, "\n");
2796 /* Link the found element into the group list. */
2797 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2799 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2800 lastinfo = stmtinfo_a;
2802 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2803 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2804 lastinfo = stmtinfo_b;
2808 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2809 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2810 && !vect_analyze_data_ref_access (dr))
2812 if (dump_enabled_p ())
2813 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2814 "not vectorized: complicated access pattern.\n");
2816 if (is_a <bb_vec_info> (vinfo))
2818 /* Mark the statement as not vectorizable. */
2819 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2820 continue;
2822 else
2824 datarefs_copy.release ();
2825 return false;
2829 datarefs_copy.release ();
2830 return true;
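/* Illustrative sketch of the chain building above (hypothetical loop):
   in

     for (i = 0; i < n; i++)
       {
         a[2 * i] = x;
         a[2 * i + 1] = y;
       }

   the two stores share base address, offset and step and their
   DR_INITs differ by exactly one element, so they are linked into a
   single group headed by the first store.  A data-ref with a
   different base address, or a store whose init gap exceeds the
   element size, ends the chain and a new group is started.  */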
2834 /* Operator == between two dr_with_seg_len objects.
2836 This equality operator is used to make sure two data refs
2837    are the same, so that we will consider combining the
2838    aliasing checks of two pairs of data-dependent data
2839 refs. */
2841 static bool
2842 operator == (const dr_with_seg_len& d1,
2843 const dr_with_seg_len& d2)
2845 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2846 DR_BASE_ADDRESS (d2.dr), 0)
2847 && compare_tree (d1.offset, d2.offset) == 0
2848 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2851 /* Function comp_dr_with_seg_len_pair.
2853 Comparison function for sorting objects of dr_with_seg_len_pair_t
2854 so that we can combine aliasing checks in one scan. */
2856 static int
2857 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2859 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2860 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2862 const dr_with_seg_len &p11 = p1->first,
2863 &p12 = p1->second,
2864 &p21 = p2->first,
2865 &p22 = p2->second;
2867  /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2868     if a and c have the same base address and step, and b and d have the same
2869     address and step.  Therefore, if either a and c or b and d don't have the
2870     same address and step, we don't care about the order of those two pairs
        after sorting.  */
2871 int comp_res;
2873 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2874 DR_BASE_ADDRESS (p21.dr))) != 0)
2875 return comp_res;
2876 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2877 DR_BASE_ADDRESS (p22.dr))) != 0)
2878 return comp_res;
2879 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2880 return comp_res;
2881 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2882 return comp_res;
2883 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2884 return comp_res;
2885 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2886 return comp_res;
2888 return 0;
2891 /* Function vect_vfa_segment_size.
2893     Create an expression that computes the size of the segment
2894     that will be accessed for a data reference.  The function takes into
2895 account that realignment loads may access one more vector.
2897 Input:
2898 DR: The data reference.
2899 LENGTH_FACTOR: segment length to consider.
2901 Return an expression whose value is the size of segment which will be
2902 accessed by DR. */
2904 static tree
2905 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2907 tree segment_length;
2909 if (integer_zerop (DR_STEP (dr)))
2910 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2911 else
2912 segment_length = size_binop (MULT_EXPR,
2913 fold_convert (sizetype, DR_STEP (dr)),
2914 fold_convert (sizetype, length_factor));
2916 if (vect_supportable_dr_alignment (dr, false)
2917 == dr_explicit_realign_optimized)
2919 tree vector_size = TYPE_SIZE_UNIT
2920 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2922 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2924 return segment_length;
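/* Worked example (hypothetical numbers): for a consecutive 4-byte
   access with DR_STEP 4 and LENGTH_FACTOR equal to a vectorization
   factor of 4, the segment size computed above is 4 * 4 = 16 bytes; a
   zero-step (invariant) reference contributes only its own size, and a
   dr_explicit_realign_optimized access gets one extra vector added.  */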
2927 /* Function vect_prune_runtime_alias_test_list.
2929 Prune a list of ddrs to be tested at run-time by versioning for alias.
2930 Merge several alias checks into one if possible.
2931     Return FALSE if the resulting list of ddrs is longer than allowed by
2932 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2934 bool
2935 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2937 vec<ddr_p> may_alias_ddrs =
2938 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2939 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2940 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2941 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2942 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2944 ddr_p ddr;
2945 unsigned int i;
2946 tree length_factor;
2948 if (dump_enabled_p ())
2949 dump_printf_loc (MSG_NOTE, vect_location,
2950 "=== vect_prune_runtime_alias_test_list ===\n");
2952 if (may_alias_ddrs.is_empty ())
2953 return true;
2955 /* Basically, for each pair of dependent data refs store_ptr_0
2956 and load_ptr_0, we create an expression:
2958 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2959 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2961 for aliasing checks. However, in some cases we can decrease
2962 the number of checks by combining two checks into one. For
2963 example, suppose we have another pair of data refs store_ptr_0
2964 and load_ptr_1, and if the following condition is satisfied:
2966 load_ptr_0 < load_ptr_1 &&
2967 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2969     (this condition means that, in each iteration of the vectorized loop,
2970 the accessed memory of store_ptr_0 cannot be between the memory
2971 of load_ptr_0 and load_ptr_1.)
2973     we can then use only the following expression to finish the
2974     aliasing checks between store_ptr_0 & load_ptr_0 and
2975 store_ptr_0 & load_ptr_1:
2977 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2978 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2980 Note that we only consider that load_ptr_0 and load_ptr_1 have the
2981     same base address.  */
2983 comp_alias_ddrs.create (may_alias_ddrs.length ());
2985 /* First, we collect all data ref pairs for aliasing checks. */
2986 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2988 struct data_reference *dr_a, *dr_b;
2989 gimple *dr_group_first_a, *dr_group_first_b;
2990 tree segment_length_a, segment_length_b;
2991 gimple *stmt_a, *stmt_b;
2993 dr_a = DDR_A (ddr);
2994 stmt_a = DR_STMT (DDR_A (ddr));
2995 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2996 if (dr_group_first_a)
2998 stmt_a = dr_group_first_a;
2999 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3002 dr_b = DDR_B (ddr);
3003 stmt_b = DR_STMT (DDR_B (ddr));
3004 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3005 if (dr_group_first_b)
3007 stmt_b = dr_group_first_b;
3008 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3011 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3012 length_factor = scalar_loop_iters;
3013 else
3014 length_factor = size_int (vect_factor);
3015 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3016 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3018 dr_with_seg_len_pair_t dr_with_seg_len_pair
3019 (dr_with_seg_len (dr_a, segment_length_a),
3020 dr_with_seg_len (dr_b, segment_length_b));
3022 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
3023 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3025 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3028 /* Second, we sort the collected data ref pairs so that we can scan
3029 them once to combine all possible aliasing checks. */
3030 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
3032 /* Third, we scan the sorted dr pairs and check if we can combine
3033 alias checks of two neighbouring dr pairs. */
3034 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
3036 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
3037 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
3038 *dr_b1 = &comp_alias_ddrs[i-1].second,
3039 *dr_a2 = &comp_alias_ddrs[i].first,
3040 *dr_b2 = &comp_alias_ddrs[i].second;
3042 /* Remove duplicate data ref pairs. */
3043 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
3045 if (dump_enabled_p ())
3047 dump_printf_loc (MSG_NOTE, vect_location,
3048 "found equal ranges ");
3049 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3050 DR_REF (dr_a1->dr));
3051 dump_printf (MSG_NOTE, ", ");
3052 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3053 DR_REF (dr_b1->dr));
3054 dump_printf (MSG_NOTE, " and ");
3055 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3056 DR_REF (dr_a2->dr));
3057 dump_printf (MSG_NOTE, ", ");
3058 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3059 DR_REF (dr_b2->dr));
3060 dump_printf (MSG_NOTE, "\n");
3063 comp_alias_ddrs.ordered_remove (i--);
3064 continue;
3067 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
3069          /* We consider the case that DR_B1 and DR_B2 are the same memref,
3070 and DR_A1 and DR_A2 are two consecutive memrefs. */
3071 if (*dr_a1 == *dr_a2)
3073 std::swap (dr_a1, dr_b1);
3074 std::swap (dr_a2, dr_b2);
3077 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
3078 DR_BASE_ADDRESS (dr_a2->dr),
3080 || !tree_fits_shwi_p (dr_a1->offset)
3081 || !tree_fits_shwi_p (dr_a2->offset))
3082 continue;
3084 HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
3085 - tree_to_shwi (dr_a1->offset));
3088 /* Now we check if the following condition is satisfied:
3090 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
3092 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
3093 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
3094             have to make a best estimate.  We can get the minimum value
3095 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
3096 then either of the following two conditions can guarantee the
3097 one above:
3099 1: DIFF <= MIN_SEG_LEN_B
3100 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
3104 HOST_WIDE_INT min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
3105 ? tree_to_shwi (dr_b1->seg_len)
3106 : vect_factor);
3108 if (diff <= min_seg_len_b
3109 || (tree_fits_shwi_p (dr_a1->seg_len)
3110 && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
3112 if (dump_enabled_p ())
3114 dump_printf_loc (MSG_NOTE, vect_location,
3115 "merging ranges for ");
3116 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3117 DR_REF (dr_a1->dr));
3118 dump_printf (MSG_NOTE, ", ");
3119 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3120 DR_REF (dr_b1->dr));
3121 dump_printf (MSG_NOTE, " and ");
3122 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3123 DR_REF (dr_a2->dr));
3124 dump_printf (MSG_NOTE, ", ");
3125 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3126 DR_REF (dr_b2->dr));
3127 dump_printf (MSG_NOTE, "\n");
3130 dr_a1->seg_len = size_binop (PLUS_EXPR,
3131 dr_a2->seg_len, size_int (diff));
3132 comp_alias_ddrs.ordered_remove (i--);
3137 dump_printf_loc (MSG_NOTE, vect_location,
3138 "improved number of alias checks from %d to %d\n",
3139 may_alias_ddrs.length (), comp_alias_ddrs.length ());
3140 if ((int) comp_alias_ddrs.length () >
3141 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3142 return false;
3144 return true;
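/* Illustrative sketch of the merging above (hypothetical loop): for

     for (i = 0; i < n; i++)
       a[i] = b[i] + b[i + 1];

   the two may-alias pairs (a, b[i]) and (a, b[i+1]) share base
   addresses and steps, and the offsets of the two 'b' references
   differ by a single element, which is within the relevant segment
   length, so the pairs are merged into one runtime check of 'a'
   against a 'b' segment extended by that difference.  */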
3147 /* Check whether a non-affine read or write in stmt is suitable for gather load
3148 or scatter store and if so, return a builtin decl for that operation. */
3150 tree
3151 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo, tree *basep,
3152 tree *offp, int *scalep)
3154 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3155 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3156 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3157 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3158 tree offtype = NULL_TREE;
3159 tree decl, base, off;
3160 machine_mode pmode;
3161 int punsignedp, reversep, pvolatilep = 0;
3163 base = DR_REF (dr);
3164 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3165 see if we can use the def stmt of the address. */
3166 if (is_gimple_call (stmt)
3167 && gimple_call_internal_p (stmt)
3168 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3169 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3170 && TREE_CODE (base) == MEM_REF
3171 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3172 && integer_zerop (TREE_OPERAND (base, 1))
3173 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3175 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3176 if (is_gimple_assign (def_stmt)
3177 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3178 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3181 /* The gather and scatter builtins need address of the form
3182 loop_invariant + vector * {1, 2, 4, 8}
3184 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3185 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3186 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3187 multiplications and additions in it. To get a vector, we need
3188 a single SSA_NAME that will be defined in the loop and will
3189 contain everything that is not loop invariant and that can be
3190     vectorized.  The following code attempts to find such a preexisting
3191 SSA_NAME OFF and put the loop invariants into a tree BASE
3192 that can be gimplified before the loop. */
3193 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3194 &punsignedp, &reversep, &pvolatilep, false);
3195 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3197 if (TREE_CODE (base) == MEM_REF)
3199 if (!integer_zerop (TREE_OPERAND (base, 1)))
3201 if (off == NULL_TREE)
3203 offset_int moff = mem_ref_offset (base);
3204 off = wide_int_to_tree (sizetype, moff);
3206 else
3207 off = size_binop (PLUS_EXPR, off,
3208 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3210 base = TREE_OPERAND (base, 0);
3212 else
3213 base = build_fold_addr_expr (base);
3215 if (off == NULL_TREE)
3216 off = size_zero_node;
3218  /* If base is not loop invariant, then either off is 0 and we start with just
3219     the constant offset in the loop invariant BASE and continue with base
3220     as OFF, or we give up.
3221 We could handle that case by gimplifying the addition of base + off
3222 into some SSA_NAME and use that as off, but for now punt. */
3223 if (!expr_invariant_in_loop_p (loop, base))
3225 if (!integer_zerop (off))
3226 return NULL_TREE;
3227 off = base;
3228 base = size_int (pbitpos / BITS_PER_UNIT);
3230 /* Otherwise put base + constant offset into the loop invariant BASE
3231 and continue with OFF. */
3232 else
3234 base = fold_convert (sizetype, base);
3235 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3238 /* OFF at this point may be either a SSA_NAME or some tree expression
3239 from get_inner_reference. Try to peel off loop invariants from it
3240 into BASE as long as possible. */
3241 STRIP_NOPS (off);
3242 while (offtype == NULL_TREE)
3244 enum tree_code code;
3245 tree op0, op1, add = NULL_TREE;
3247 if (TREE_CODE (off) == SSA_NAME)
3249 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3251 if (expr_invariant_in_loop_p (loop, off))
3252 return NULL_TREE;
3254 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3255 break;
3257 op0 = gimple_assign_rhs1 (def_stmt);
3258 code = gimple_assign_rhs_code (def_stmt);
3259 op1 = gimple_assign_rhs2 (def_stmt);
3261 else
3263 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3264 return NULL_TREE;
3265 code = TREE_CODE (off);
3266 extract_ops_from_tree (off, &code, &op0, &op1);
3268 switch (code)
3270 case POINTER_PLUS_EXPR:
3271 case PLUS_EXPR:
3272 if (expr_invariant_in_loop_p (loop, op0))
3274 add = op0;
3275 off = op1;
3276 do_add:
3277 add = fold_convert (sizetype, add);
3278 if (scale != 1)
3279 add = size_binop (MULT_EXPR, add, size_int (scale));
3280 base = size_binop (PLUS_EXPR, base, add);
3281 continue;
3283 if (expr_invariant_in_loop_p (loop, op1))
3285 add = op1;
3286 off = op0;
3287 goto do_add;
3289 break;
3290 case MINUS_EXPR:
3291 if (expr_invariant_in_loop_p (loop, op1))
3293 add = fold_convert (sizetype, op1);
3294 add = size_binop (MINUS_EXPR, size_zero_node, add);
3295 off = op0;
3296 goto do_add;
3298 break;
3299 case MULT_EXPR:
3300 if (scale == 1 && tree_fits_shwi_p (op1))
3302 scale = tree_to_shwi (op1);
3303 off = op0;
3304 continue;
3306 break;
3307 case SSA_NAME:
3308 off = op0;
3309 continue;
3310 CASE_CONVERT:
3311 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3312 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3313 break;
3314 if (TYPE_PRECISION (TREE_TYPE (op0))
3315 == TYPE_PRECISION (TREE_TYPE (off)))
3317 off = op0;
3318 continue;
3320 if (TYPE_PRECISION (TREE_TYPE (op0))
3321 < TYPE_PRECISION (TREE_TYPE (off)))
3323 off = op0;
3324 offtype = TREE_TYPE (off);
3325 STRIP_NOPS (off);
3326 continue;
3328 break;
3329 default:
3330 break;
3332 break;
3335  /* If at the end OFF still isn't an SSA_NAME or isn't
3336 defined in the loop, punt. */
3337 if (TREE_CODE (off) != SSA_NAME
3338 || expr_invariant_in_loop_p (loop, off))
3339 return NULL_TREE;
3341 if (offtype == NULL_TREE)
3342 offtype = TREE_TYPE (off);
3344 if (DR_IS_READ (dr))
3345 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3346 offtype, scale);
3347 else
3348 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3349 offtype, scale);
3351 if (decl == NULL_TREE)
3352 return NULL_TREE;
3354 if (basep)
3355 *basep = base;
3356 if (offp)
3357 *offp = off;
3358 if (scalep)
3359 *scalep = scale;
3360 return decl;
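/* Illustrative sketch (hypothetical loop and names): a non-affine read
   such as a[idx[i]] in

     for (i = 0; i < n; i++)
       s += a[idx[i]];

   decomposes into a loop-invariant BASE (the address of 'a'), an
   in-loop SSA_NAME OFF (the value loaded from idx[i], possibly
   sign-extended) and a SCALE of sizeof (*a); if the target provides a
   builtin_gather for the vector type, the decl returned above is later
   used to emit a gather load, and scatter stores are handled
   symmetrically.  */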
3363 /* Function vect_analyze_data_refs.
3365 Find all the data references in the loop or basic block.
3367 The general structure of the analysis of data refs in the vectorizer is as
3368 follows:
3369 1- vect_analyze_data_refs(loop/bb): call
3370 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3371 in the loop/bb and their dependences.
3372 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3373 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3374 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3378 bool
3379 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3381 struct loop *loop = NULL;
3382 unsigned int i;
3383 struct data_reference *dr;
3384 tree scalar_type;
3386 if (dump_enabled_p ())
3387 dump_printf_loc (MSG_NOTE, vect_location,
3388 "=== vect_analyze_data_refs ===\n");
3390 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3391 loop = LOOP_VINFO_LOOP (loop_vinfo);
3393 /* Go through the data-refs, check that the analysis succeeded. Update
3394 pointer from stmt_vec_info struct to DR and vectype. */
3396 vec<data_reference_p> datarefs = vinfo->datarefs;
3397 FOR_EACH_VEC_ELT (datarefs, i, dr)
3399 gimple *stmt;
3400 stmt_vec_info stmt_info;
3401 tree base, offset, init;
3402 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3403 bool simd_lane_access = false;
3404 int vf;
3406 again:
3407 if (!dr || !DR_REF (dr))
3409 if (dump_enabled_p ())
3410 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3411 "not vectorized: unhandled data-ref\n");
3412 return false;
3415 stmt = DR_STMT (dr);
3416 stmt_info = vinfo_for_stmt (stmt);
3418 /* Discard clobbers from the dataref vector. We will remove
3419 clobber stmts during vectorization. */
3420 if (gimple_clobber_p (stmt))
3422 free_data_ref (dr);
3423 if (i == datarefs.length () - 1)
3425 datarefs.pop ();
3426 break;
3428 datarefs.ordered_remove (i);
3429 dr = datarefs[i];
3430 goto again;
3433 /* Check that analysis of the data-ref succeeded. */
3434 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3435 || !DR_STEP (dr))
3437 bool maybe_gather
3438 = DR_IS_READ (dr)
3439 && !TREE_THIS_VOLATILE (DR_REF (dr))
3440 && targetm.vectorize.builtin_gather != NULL;
3441 bool maybe_scatter
3442 = DR_IS_WRITE (dr)
3443 && !TREE_THIS_VOLATILE (DR_REF (dr))
3444 && targetm.vectorize.builtin_scatter != NULL;
3445 bool maybe_simd_lane_access
3446 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3448 /* If target supports vector gather loads or scatter stores, or if
3449         this might be a SIMD lane access, see whether they can be used.  */
3450 if (is_a <loop_vec_info> (vinfo)
3451 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3452 && !nested_in_vect_loop_p (loop, stmt))
3454 struct data_reference *newdr
3455 = create_data_ref (NULL, loop_containing_stmt (stmt),
3456 DR_REF (dr), stmt, maybe_scatter ? false : true);
3457 gcc_assert (newdr != NULL && DR_REF (newdr));
3458 if (DR_BASE_ADDRESS (newdr)
3459 && DR_OFFSET (newdr)
3460 && DR_INIT (newdr)
3461 && DR_STEP (newdr)
3462 && integer_zerop (DR_STEP (newdr)))
3464 if (maybe_simd_lane_access)
3466 tree off = DR_OFFSET (newdr);
3467 STRIP_NOPS (off);
3468 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3469 && TREE_CODE (off) == MULT_EXPR
3470 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3472 tree step = TREE_OPERAND (off, 1);
3473 off = TREE_OPERAND (off, 0);
3474 STRIP_NOPS (off);
3475 if (CONVERT_EXPR_P (off)
3476 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3477 0)))
3478 < TYPE_PRECISION (TREE_TYPE (off)))
3479 off = TREE_OPERAND (off, 0);
3480 if (TREE_CODE (off) == SSA_NAME)
3482 gimple *def = SSA_NAME_DEF_STMT (off);
3483 tree reft = TREE_TYPE (DR_REF (newdr));
3484 if (is_gimple_call (def)
3485 && gimple_call_internal_p (def)
3486 && (gimple_call_internal_fn (def)
3487 == IFN_GOMP_SIMD_LANE))
3489 tree arg = gimple_call_arg (def, 0);
3490 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3491 arg = SSA_NAME_VAR (arg);
3492 if (arg == loop->simduid
3493 /* For now. */
3494 && tree_int_cst_equal
3495 (TYPE_SIZE_UNIT (reft),
3496 step))
3498 DR_OFFSET (newdr) = ssize_int (0);
3499 DR_STEP (newdr) = step;
3500 DR_ALIGNED_TO (newdr)
3501 = size_int (BIGGEST_ALIGNMENT);
3502 dr = newdr;
3503 simd_lane_access = true;
3509 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3511 dr = newdr;
3512 if (maybe_gather)
3513 gatherscatter = GATHER;
3514 else
3515 gatherscatter = SCATTER;
3518 if (gatherscatter == SG_NONE && !simd_lane_access)
3519 free_data_ref (newdr);
3522 if (gatherscatter == SG_NONE && !simd_lane_access)
3524 if (dump_enabled_p ())
3526 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3527 "not vectorized: data ref analysis "
3528 "failed ");
3529 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3530 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3533 if (is_a <bb_vec_info> (vinfo))
3534 break;
3536 return false;
3540 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3542 if (dump_enabled_p ())
3543 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3544 "not vectorized: base addr of dr is a "
3545 "constant\n");
3547 if (is_a <bb_vec_info> (vinfo))
3548 break;
3550 if (gatherscatter != SG_NONE || simd_lane_access)
3551 free_data_ref (dr);
3552 return false;
3555 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3557 if (dump_enabled_p ())
3559 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3560 "not vectorized: volatile type ");
3561 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3562 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3565 if (is_a <bb_vec_info> (vinfo))
3566 break;
3568 return false;
3571 if (stmt_can_throw_internal (stmt))
3573 if (dump_enabled_p ())
3575 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3576 "not vectorized: statement can throw an "
3577 "exception ");
3578 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3579 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3582 if (is_a <bb_vec_info> (vinfo))
3583 break;
3585 if (gatherscatter != SG_NONE || simd_lane_access)
3586 free_data_ref (dr);
3587 return false;
3590 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3591 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3593 if (dump_enabled_p ())
3595 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3596 "not vectorized: statement is bitfield "
3597 "access ");
3598 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3599 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3602 if (is_a <bb_vec_info> (vinfo))
3603 break;
3605 if (gatherscatter != SG_NONE || simd_lane_access)
3606 free_data_ref (dr);
3607 return false;
3610 base = unshare_expr (DR_BASE_ADDRESS (dr));
3611 offset = unshare_expr (DR_OFFSET (dr));
3612 init = unshare_expr (DR_INIT (dr));
3614 if (is_gimple_call (stmt)
3615 && (!gimple_call_internal_p (stmt)
3616 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3617 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3619 if (dump_enabled_p ())
3621 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3622 "not vectorized: dr in a call ");
3623 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3624 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3627 if (is_a <bb_vec_info> (vinfo))
3628 break;
3630 if (gatherscatter != SG_NONE || simd_lane_access)
3631 free_data_ref (dr);
3632 return false;
3635 /* Update DR field in stmt_vec_info struct. */
3637      /* If the dataref is in an inner-loop of the loop that is considered
3638 for vectorization, we also want to analyze the access relative to
3639 the outer-loop (DR contains information only relative to the
3640 inner-most enclosing loop). We do that by building a reference to the
3641 first location accessed by the inner-loop, and analyzing it relative to
3642 the outer-loop. */
3643 if (loop && nested_in_vect_loop_p (loop, stmt))
3645 tree outer_step, outer_base, outer_init;
3646 HOST_WIDE_INT pbitsize, pbitpos;
3647 tree poffset;
3648 machine_mode pmode;
3649 int punsignedp, preversep, pvolatilep;
3650 affine_iv base_iv, offset_iv;
3651 tree dinit;
3653 /* Build a reference to the first location accessed by the
3654 inner-loop: *(BASE+INIT). (The first location is actually
3655 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3656 tree inner_base = build_fold_indirect_ref
3657 (fold_build_pointer_plus (base, init));
3659 if (dump_enabled_p ())
3661 dump_printf_loc (MSG_NOTE, vect_location,
3662 "analyze in outer-loop: ");
3663 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3664 dump_printf (MSG_NOTE, "\n");
3667 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3668 &poffset, &pmode, &punsignedp,
3669 &preversep, &pvolatilep, false);
3670 gcc_assert (outer_base != NULL_TREE);
3672 if (pbitpos % BITS_PER_UNIT != 0)
3674 if (dump_enabled_p ())
3675 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3676 "failed: bit offset alignment.\n");
3677 return false;
3680 if (preversep)
3682 if (dump_enabled_p ())
3683 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3684 "failed: reverse storage order.\n");
3685 return false;
3688 outer_base = build_fold_addr_expr (outer_base);
3689 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3690 &base_iv, false))
3692 if (dump_enabled_p ())
3693 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3694 "failed: evolution of base is not affine.\n");
3695 return false;
3698 if (offset)
3700 if (poffset)
3701 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3702 poffset);
3703 else
3704 poffset = offset;
3707 if (!poffset)
3709 offset_iv.base = ssize_int (0);
3710 offset_iv.step = ssize_int (0);
3712 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3713 &offset_iv, false))
3715 if (dump_enabled_p ())
3716 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3717 "evolution of offset is not affine.\n");
3718 return false;
3721 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3722 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3723 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3724 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3725 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3727 outer_step = size_binop (PLUS_EXPR,
3728 fold_convert (ssizetype, base_iv.step),
3729 fold_convert (ssizetype, offset_iv.step));
3731 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3732 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3733 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3734 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3735 STMT_VINFO_DR_OFFSET (stmt_info) =
3736 fold_convert (ssizetype, offset_iv.base);
3737 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3738 size_int (highest_pow2_factor (offset_iv.base));
3740 if (dump_enabled_p ())
3742 dump_printf_loc (MSG_NOTE, vect_location,
3743 "\touter base_address: ");
3744 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3745 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3746 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3747 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3748 STMT_VINFO_DR_OFFSET (stmt_info));
3749 dump_printf (MSG_NOTE,
3750 "\n\touter constant offset from base address: ");
3751 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3752 STMT_VINFO_DR_INIT (stmt_info));
3753 dump_printf (MSG_NOTE, "\n\touter step: ");
3754 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3755 STMT_VINFO_DR_STEP (stmt_info));
3756 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3757 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3758 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3759 dump_printf (MSG_NOTE, "\n");
3763 if (STMT_VINFO_DATA_REF (stmt_info))
3765 if (dump_enabled_p ())
3767 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3768 "not vectorized: more than one data ref "
3769 "in stmt: ");
3770 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3771 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3774 if (is_a <bb_vec_info> (vinfo))
3775 break;
3777 if (gatherscatter != SG_NONE || simd_lane_access)
3778 free_data_ref (dr);
3779 return false;
3782 STMT_VINFO_DATA_REF (stmt_info) = dr;
3783 if (simd_lane_access)
3785 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3786 free_data_ref (datarefs[i]);
3787 datarefs[i] = dr;
3790 /* Set vectype for STMT. */
3791 scalar_type = TREE_TYPE (DR_REF (dr));
3792 STMT_VINFO_VECTYPE (stmt_info)
3793 = get_vectype_for_scalar_type (scalar_type);
3794 if (!STMT_VINFO_VECTYPE (stmt_info))
3796 if (dump_enabled_p ())
3798 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3799 "not vectorized: no vectype for stmt: ");
3800 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3801 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3802 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3803 scalar_type);
3804 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3807 if (is_a <bb_vec_info> (vinfo))
3809 /* No vector type is fine, the ref can still participate
3810 in dependence analysis, we just can't vectorize it. */
3811 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3812 continue;
3815 if (gatherscatter != SG_NONE || simd_lane_access)
3817 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3818 if (gatherscatter != SG_NONE)
3819 free_data_ref (dr);
3821 return false;
3823 else
3825 if (dump_enabled_p ())
3827 dump_printf_loc (MSG_NOTE, vect_location,
3828 "got vectype for stmt: ");
3829 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3830 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3831 STMT_VINFO_VECTYPE (stmt_info));
3832 dump_printf (MSG_NOTE, "\n");
3836 /* Adjust the minimal vectorization factor according to the
3837 vector type. */
3838 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3839 if (vf > *min_vf)
3840 *min_vf = vf;
3842 if (gatherscatter != SG_NONE)
3844 tree off;
3845 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3846 NULL, &off, NULL)
3847 || get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3849 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3850 free_data_ref (dr);
3851 if (dump_enabled_p ())
3853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3854 (gatherscatter == GATHER) ?
3855 "not vectorized: not suitable for gather "
3856 "load " :
3857 "not vectorized: not suitable for scatter "
3858 "store ");
3859 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3860 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3862 return false;
3865 free_data_ref (datarefs[i]);
3866 datarefs[i] = dr;
3867 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3870 else if (is_a <loop_vec_info> (vinfo)
3871 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3873 if (nested_in_vect_loop_p (loop, stmt))
3875 if (dump_enabled_p ())
3877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3878 "not vectorized: not suitable for strided "
3879 "load ");
3880 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3881 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3883 return false;
3885 STMT_VINFO_STRIDED_P (stmt_info) = true;
3889 /* If we stopped analysis at the first dataref we could not analyze
3890 when trying to vectorize a basic-block, mark the rest of the datarefs
3891 as not vectorizable and truncate the vector of datarefs. That
3892 avoids spending useless time in analyzing their dependence. */
3893 if (i != datarefs.length ())
3895 gcc_assert (is_a <bb_vec_info> (vinfo));
3896 for (unsigned j = i; j < datarefs.length (); ++j)
3898 data_reference_p dr = datarefs[j];
3899 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3900 free_data_ref (dr);
3902 datarefs.truncate (i);
3905 return true;
3909 /* Function vect_get_new_vect_var.
3911 Returns a name for a new variable. The current naming scheme prepends a
3912 prefix such as "vect", "stmp", "mask" or "vectp" (depending on the value
3913 of VAR_KIND) to the names of vectorizer generated variables, joining the
3914 prefix and NAME with an underscore when NAME is provided. */
3916 tree
3917 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3919 const char *prefix;
3920 tree new_vect_var;
3922 switch (var_kind)
3924 case vect_simple_var:
3925 prefix = "vect";
3926 break;
3927 case vect_scalar_var:
3928 prefix = "stmp";
3929 break;
3930 case vect_mask_var:
3931 prefix = "mask";
3932 break;
3933 case vect_pointer_var:
3934 prefix = "vectp";
3935 break;
3936 default:
3937 gcc_unreachable ();
3940 if (name)
3942 char* tmp = concat (prefix, "_", name, NULL);
3943 new_vect_var = create_tmp_reg (type, tmp);
3944 free (tmp);
3946 else
3947 new_vect_var = create_tmp_reg (type, prefix);
3949 return new_vect_var;
3952 /* Like vect_get_new_vect_var but return an SSA name. */
3954 tree
3955 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3957 const char *prefix;
3958 tree new_vect_var;
3960 switch (var_kind)
3962 case vect_simple_var:
3963 prefix = "vect";
3964 break;
3965 case vect_scalar_var:
3966 prefix = "stmp";
3967 break;
3968 case vect_pointer_var:
3969 prefix = "vectp";
3970 break;
3971 default:
3972 gcc_unreachable ();
3975 if (name)
3977 char* tmp = concat (prefix, "_", name, NULL);
3978 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
3979 free (tmp);
3981 else
3982 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
3984 return new_vect_var;
3987 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
3989 static void
3990 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3991 stmt_vec_info stmt_info)
3993 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3994 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3995 int misalign = DR_MISALIGNMENT (dr);
3996 if (misalign == -1)
3997 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3998 else
3999 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
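/* A minimal stand-alone sketch of the notion used above (illustrative only,
   not vectorizer code): the recorded misalignment is the byte offset of the
   access from the vector alignment boundary, with -1 standing for "unknown",
   which corresponds to the mark_ptr_info_alignment_unknown case.  Here
   BYTE_OFFSET stands for the address of the access relative to an aligned
   origin; the name is made up for the sketch.  */

static int
illustrate_misalignment (unsigned long long byte_offset, unsigned int align)
{
  if (align == 0)
    return -1;                          /* Alignment not known.  */
  /* A misalignment of 0 means the access is fully aligned.  */
  return (int) (byte_offset % align);
}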
4002 /* Function vect_create_addr_base_for_vector_ref.
4004 Create an expression that computes the address of the first memory location
4005 that will be accessed for a data reference.
4007 Input:
4008 STMT: The statement containing the data reference.
4009 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4010 OFFSET: Optional. If supplied, it is added to the initial address.
4011 LOOP: Specify relative to which loop-nest the address should be computed.
4012 For example, when the dataref is in an inner-loop nested in an
4013 outer-loop that is now being vectorized, LOOP can be either the
4014 outer-loop, or the inner-loop. The first memory location accessed
4015 by the following dataref ('in' points to short):
4017 for (i=0; i<N; i++)
4018 for (j=0; j<M; j++)
4019 s += in[i+j]
4021 is as follows:
4022 if LOOP=i_loop: &in (relative to i_loop)
4023 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4024 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4025 initial address. Unlike OFFSET, which is a number of elements to
4026 be added, BYTE_OFFSET is measured in bytes.
4028 Output:
4029 1. Return an SSA_NAME whose value is the address of the memory location of
4030 the first vector of the data reference.
4031 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4032 these statement(s) which define the returned SSA_NAME.
4034 FORNOW: We are only handling array accesses with step 1. */
4036 tree
4037 vect_create_addr_base_for_vector_ref (gimple *stmt,
4038 gimple_seq *new_stmt_list,
4039 tree offset,
4040 struct loop *loop,
4041 tree byte_offset)
4043 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4044 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4045 tree data_ref_base;
4046 const char *base_name;
4047 tree addr_base;
4048 tree dest;
4049 gimple_seq seq = NULL;
4050 tree base_offset;
4051 tree init;
4052 tree vect_ptr_type;
4053 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4054 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4056 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
4058 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
4060 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
4062 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4063 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
4064 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
4066 else
4068 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
4069 base_offset = unshare_expr (DR_OFFSET (dr));
4070 init = unshare_expr (DR_INIT (dr));
4073 if (loop_vinfo)
4074 base_name = get_name (data_ref_base);
4075 else
4077 base_offset = ssize_int (0);
4078 init = ssize_int (0);
4079 base_name = get_name (DR_REF (dr));
4082 /* Create base_offset */
4083 base_offset = size_binop (PLUS_EXPR,
4084 fold_convert (sizetype, base_offset),
4085 fold_convert (sizetype, init));
4087 if (offset)
4089 offset = fold_build2 (MULT_EXPR, sizetype,
4090 fold_convert (sizetype, offset), step);
4091 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4092 base_offset, offset);
4094 if (byte_offset)
4096 byte_offset = fold_convert (sizetype, byte_offset);
4097 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4098 base_offset, byte_offset);
4101 /* base + base_offset */
4102 if (loop_vinfo)
4103 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4104 else
4106 addr_base = build1 (ADDR_EXPR,
4107 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4108 unshare_expr (DR_REF (dr)));
4111 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4112 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4113 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4114 gimple_seq_add_seq (new_stmt_list, seq);
4116 if (DR_PTR_INFO (dr)
4117 && TREE_CODE (addr_base) == SSA_NAME
4118 && !SSA_NAME_PTR_INFO (addr_base))
4120 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4121 if (offset || byte_offset)
4122 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4125 if (dump_enabled_p ())
4127 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4128 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4129 dump_printf (MSG_NOTE, "\n");
4132 return addr_base;
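/* A stand-alone sketch of the byte arithmetic assembled above (illustrative
   only; the parameter names are made up).  The returned address is
   BASE + DR_OFFSET + DR_INIT + OFFSET * element_size + BYTE_OFFSET, where
   OFFSET counts elements and BYTE_OFFSET counts bytes, matching the two
   optional arguments documented above.  */

static const char *
illustrate_addr_base (const char *base, long dr_offset, long dr_init,
                      long offset_in_elements, long element_size,
                      long byte_offset)
{
  long total = dr_offset + dr_init;              /* DR_OFFSET + DR_INIT  */
  total += offset_in_elements * element_size;    /* OFFSET is in elements  */
  total += byte_offset;                          /* BYTE_OFFSET is in bytes  */
  return base + total;
}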
4136 /* Function vect_create_data_ref_ptr.
4138 Create a new pointer-to-AGGR_TYPE variable (ap) that points to the first
4139 location accessed in the loop by STMT, along with the def-use update
4140 chain to appropriately advance the pointer through the loop iterations.
4141 Also set aliasing information for the pointer. This pointer is used by
4142 the callers to this function to create a memory reference expression for
4143 vector load/store access.
4145 Input:
4146 1. STMT: a stmt that references memory. Expected to be of the form
4147 GIMPLE_ASSIGN <name, data-ref> or
4148 GIMPLE_ASSIGN <data-ref, name>.
4149 2. AGGR_TYPE: the type of the reference, which should be either a vector
4150 or an array.
4151 3. AT_LOOP: the loop where the vector memref is to be created.
4152 4. OFFSET (optional): an offset to be added to the initial address accessed
4153 by the data-ref in STMT.
4154 5. BSI: location where the new stmts are to be placed if there is no loop
4155 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4156 pointing to the initial address.
4157 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4158 to the initial address accessed by the data-ref in STMT. This is
4159 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4160 in bytes.
4162 Output:
4163 1. Declare a new ptr to vector_type, and have it point to the base of the
4164 data reference (the initial address accessed by the data reference).
4165 For example, for vector of type V8HI, the following code is generated:
4167 v8hi *ap;
4168 ap = (v8hi *)initial_address;
4170 if OFFSET is not supplied:
4171 initial_address = &a[init];
4172 if OFFSET is supplied:
4173 initial_address = &a[init + OFFSET];
4174 if BYTE_OFFSET is supplied:
4175 initial_address = &a[init] + BYTE_OFFSET;
4177 Return the initial_address in INITIAL_ADDRESS.
4179 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4180 update the pointer in each iteration of the loop.
4182 Return the increment stmt that updates the pointer in PTR_INCR.
4184 3. Set INV_P to true if the access pattern of the data reference in the
4185 vectorized loop is invariant. Set it to false otherwise.
4187 4. Return the pointer. */
4189 tree
4190 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4191 tree offset, tree *initial_address,
4192 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4193 bool only_init, bool *inv_p, tree byte_offset)
4195 const char *base_name;
4196 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4197 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4198 struct loop *loop = NULL;
4199 bool nested_in_vect_loop = false;
4200 struct loop *containing_loop = NULL;
4201 tree aggr_ptr_type;
4202 tree aggr_ptr;
4203 tree new_temp;
4204 gimple_seq new_stmt_list = NULL;
4205 edge pe = NULL;
4206 basic_block new_bb;
4207 tree aggr_ptr_init;
4208 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4209 tree aptr;
4210 gimple_stmt_iterator incr_gsi;
4211 bool insert_after;
4212 tree indx_before_incr, indx_after_incr;
4213 gimple *incr;
4214 tree step;
4215 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4217 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4218 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4220 if (loop_vinfo)
4222 loop = LOOP_VINFO_LOOP (loop_vinfo);
4223 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4224 containing_loop = (gimple_bb (stmt))->loop_father;
4225 pe = loop_preheader_edge (loop);
4227 else
4229 gcc_assert (bb_vinfo);
4230 only_init = true;
4231 *ptr_incr = NULL;
4234 /* Check the step (evolution) of the load in LOOP, and record
4235 whether it's invariant. */
4236 if (nested_in_vect_loop)
4237 step = STMT_VINFO_DR_STEP (stmt_info);
4238 else
4239 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4241 if (integer_zerop (step))
4242 *inv_p = true;
4243 else
4244 *inv_p = false;
4246 /* Create an expression for the first address accessed by this load
4247 in LOOP. */
4248 base_name = get_name (DR_BASE_ADDRESS (dr));
4250 if (dump_enabled_p ())
4252 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4253 dump_printf_loc (MSG_NOTE, vect_location,
4254 "create %s-pointer variable to type: ",
4255 get_tree_code_name (TREE_CODE (aggr_type)));
4256 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4257 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4258 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4259 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4260 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4261 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4262 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4263 else
4264 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4265 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4266 dump_printf (MSG_NOTE, "\n");
4269 /* (1) Create the new aggregate-pointer variable.
4270 Vector and array types inherit the alias set of their component
4271 type by default so we need to use a ref-all pointer if the data
4272 reference does not conflict with the created aggregated data
4273 reference because it is not addressable. */
4274 bool need_ref_all = false;
4275 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4276 get_alias_set (DR_REF (dr))))
4277 need_ref_all = true;
4278 /* Likewise for any of the data references in the stmt group. */
4279 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4281 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4284 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4285 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4286 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4287 get_alias_set (DR_REF (sdr))))
4289 need_ref_all = true;
4290 break;
4292 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4294 while (orig_stmt);
4296 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4297 need_ref_all);
4298 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4301 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4302 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4303 def-use update cycles for the pointer: one relative to the outer-loop
4304 (LOOP), which is what steps (3) and (4) below do. The other is relative
4305 to the inner-loop (which is the inner-most loop containing the dataref),
4306 and this is done by step (5) below.
4308 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4309 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4310 redundant. Steps (3),(4) create the following:
4312 vp0 = &base_addr;
4313 LOOP: vp1 = phi(vp0,vp2)
4316 vp2 = vp1 + step
4317 goto LOOP
4319 If there is an inner-loop nested in loop, then step (5) will also be
4320 applied, and an additional update in the inner-loop will be created:
4322 vp0 = &base_addr;
4323 LOOP: vp1 = phi(vp0,vp2)
4325 inner: vp3 = phi(vp1,vp4)
4326 vp4 = vp3 + inner_step
4327 if () goto inner
4329 vp2 = vp1 + step
4330 if () goto LOOP */
4332 /* (2) Calculate the initial address of the aggregate-pointer, and set
4333 the aggregate-pointer to point to it before the loop. */
4335 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4337 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4338 offset, loop, byte_offset);
4339 if (new_stmt_list)
4341 if (pe)
4343 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4344 gcc_assert (!new_bb);
4346 else
4347 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4350 *initial_address = new_temp;
4351 aggr_ptr_init = new_temp;
4353 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4354 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4355 inner-loop nested in LOOP (during outer-loop vectorization). */
4357 /* No update in loop is required. */
4358 if (only_init && (!loop_vinfo || at_loop == loop))
4359 aptr = aggr_ptr_init;
4360 else
4362 /* The step of the aggregate pointer is the type size. */
4363 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4364 /* One exception to the above is when the scalar step of the load in
4365 LOOP is zero. In this case the step here is also zero. */
4366 if (*inv_p)
4367 iv_step = size_zero_node;
4368 else if (tree_int_cst_sgn (step) == -1)
4369 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4371 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4373 create_iv (aggr_ptr_init,
4374 fold_convert (aggr_ptr_type, iv_step),
4375 aggr_ptr, loop, &incr_gsi, insert_after,
4376 &indx_before_incr, &indx_after_incr);
4377 incr = gsi_stmt (incr_gsi);
4378 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4380 /* Copy the points-to information if it exists. */
4381 if (DR_PTR_INFO (dr))
4383 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4384 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4386 if (ptr_incr)
4387 *ptr_incr = incr;
4389 aptr = indx_before_incr;
4392 if (!nested_in_vect_loop || only_init)
4393 return aptr;
4396 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4397 nested in LOOP, if it exists. */
4399 gcc_assert (nested_in_vect_loop);
4400 if (!only_init)
4402 standard_iv_increment_position (containing_loop, &incr_gsi,
4403 &insert_after);
4404 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4405 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4406 &indx_after_incr);
4407 incr = gsi_stmt (incr_gsi);
4408 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4410 /* Copy the points-to information if it exists. */
4411 if (DR_PTR_INFO (dr))
4413 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4414 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4416 if (ptr_incr)
4417 *ptr_incr = incr;
4419 return indx_before_incr;
4421 else
4422 gcc_unreachable ();
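/* A stand-alone scalar sketch of the two pointer update cycles created above
   for outer-loop vectorization (illustrative only; steps are counted in
   elements rather than bytes, and the trip counts are parameters).  VP1
   advances by STEP once per outer iteration, while VP3 restarts from VP1 and
   advances by INNER_STEP inside the inner loop, mirroring the vp0/vp1/vp2
   and vp3/vp4 update chains sketched in the comment above step (2).  */

static long
illustrate_nested_ptr_update (const int *base_addr, int outer_trips,
                              int inner_trips, int step, int inner_step)
{
  long sum = 0;
  const int *vp1 = base_addr;               /* LOOP: vp1 = phi (vp0, vp2)  */
  for (int i = 0; i < outer_trips; i++)
    {
      const int *vp3 = vp1;                 /* inner: vp3 = phi (vp1, vp4)  */
      for (int j = 0; j < inner_trips; j++)
        {
          sum += *vp3;
          vp3 += inner_step;                /* vp4 = vp3 + inner_step  */
        }
      vp1 += step;                          /* vp2 = vp1 + step  */
    }
  return sum;
}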
4426 /* Function bump_vector_ptr
4428 Increment a pointer (to a vector type) by vector-size. If requested,
4429 i.e. if PTR-INCR is given, then also connect the new increment stmt
4430 to the existing def-use update-chain of the pointer, by modifying
4431 the PTR_INCR as illustrated below:
4433 The pointer def-use update-chain before this function:
4434 DATAREF_PTR = phi (p_0, p_2)
4435 ....
4436 PTR_INCR: p_2 = DATAREF_PTR + step
4438 The pointer def-use update-chain after this function:
4439 DATAREF_PTR = phi (p_0, p_2)
4440 ....
4441 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4442 ....
4443 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4445 Input:
4446 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4447 in the loop.
4448 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4449 the loop. The increment amount across iterations is expected
4450 to be vector_size.
4451 BSI - location where the new update stmt is to be placed.
4452 STMT - the original scalar memory-access stmt that is being vectorized.
4453 BUMP - optional. The offset by which to bump the pointer. If not given,
4454 the offset is assumed to be vector_size.
4456 Output: Return NEW_DATAREF_PTR as illustrated above.
4460 tree
4461 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4462 gimple *stmt, tree bump)
4464 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4465 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4466 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4467 tree update = TYPE_SIZE_UNIT (vectype);
4468 gassign *incr_stmt;
4469 ssa_op_iter iter;
4470 use_operand_p use_p;
4471 tree new_dataref_ptr;
4473 if (bump)
4474 update = bump;
4476 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4477 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4478 else
4479 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4480 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4481 dataref_ptr, update);
4482 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4484 /* Copy the points-to information if it exists. */
4485 if (DR_PTR_INFO (dr))
4487 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4488 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4491 if (!ptr_incr)
4492 return new_dataref_ptr;
4494 /* Update the vector-pointer's cross-iteration increment. */
4495 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4497 tree use = USE_FROM_PTR (use_p);
4499 if (use == dataref_ptr)
4500 SET_USE (use_p, new_dataref_ptr);
4501 else
4502 gcc_assert (tree_int_cst_compare (use, update) == 0);
4505 return new_dataref_ptr;
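/* A scalar sketch of the rewired update chain documented above (illustrative
   only; the vector size VS and the loop shape are made up).  The in-body
   bump produces NEW_DATAREF_PTR, and the cross-iteration increment then
   steps from the bumped pointer instead of from the original DATAREF_PTR,
   so each iteration advances by two "vectors" in total.  */

static long
illustrate_bump_ptr (const int *data, int n)
{
  const int vs = 4;                          /* pretend vector size, in elements  */
  long sum = 0;
  const int *dataref_ptr = data;             /* DATAREF_PTR = phi (p_0, p_2)  */
  for (int i = 0; i + 2 * vs <= n; i += 2 * vs)
    {
      for (int j = 0; j < vs; j++)           /* use the vector at DATAREF_PTR  */
        sum += dataref_ptr[j];
      const int *bumped = dataref_ptr + vs;  /* NEW_DATAREF_PTR = DATAREF_PTR + BUMP  */
      for (int j = 0; j < vs; j++)           /* use the vector at the bumped pointer  */
        sum += bumped[j];
      dataref_ptr = bumped + vs;             /* PTR_INCR: p_2 = NEW_DATAREF_PTR + step  */
    }
  return sum;
}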
4509 /* Function vect_create_destination_var.
4511 Create a new temporary of type VECTYPE. */
4513 tree
4514 vect_create_destination_var (tree scalar_dest, tree vectype)
4516 tree vec_dest;
4517 const char *name;
4518 char *new_name;
4519 tree type;
4520 enum vect_var_kind kind;
4522 kind = vectype
4523 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4524 ? vect_mask_var
4525 : vect_simple_var
4526 : vect_scalar_var;
4527 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4529 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4531 name = get_name (scalar_dest);
4532 if (name)
4533 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4534 else
4535 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4536 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4537 free (new_name);
4539 return vec_dest;
4542 /* Function vect_grouped_store_supported.
4544 Returns TRUE if interleave high and interleave low permutations
4545 are supported, and FALSE otherwise. */
4547 bool
4548 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4550 machine_mode mode = TYPE_MODE (vectype);
4552 /* vect_permute_store_chain requires the group size to be equal to 3 or
4553 be a power of two. */
4554 if (count != 3 && exact_log2 (count) == -1)
4556 if (dump_enabled_p ())
4557 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4558 "the size of the group of accesses"
4559 " is not a power of 2 or not eqaul to 3\n");
4560 return false;
4563 /* Check that the permutation is supported. */
4564 if (VECTOR_MODE_P (mode))
4566 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4567 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4569 if (count == 3)
4571 unsigned int j0 = 0, j1 = 0, j2 = 0;
4572 unsigned int i, j;
4574 for (j = 0; j < 3; j++)
4576 int nelt0 = ((3 - j) * nelt) % 3;
4577 int nelt1 = ((3 - j) * nelt + 1) % 3;
4578 int nelt2 = ((3 - j) * nelt + 2) % 3;
4579 for (i = 0; i < nelt; i++)
4581 if (3 * i + nelt0 < nelt)
4582 sel[3 * i + nelt0] = j0++;
4583 if (3 * i + nelt1 < nelt)
4584 sel[3 * i + nelt1] = nelt + j1++;
4585 if (3 * i + nelt2 < nelt)
4586 sel[3 * i + nelt2] = 0;
4588 if (!can_vec_perm_p (mode, false, sel))
4590 if (dump_enabled_p ())
4591 dump_printf (MSG_MISSED_OPTIMIZATION,
4592 "permutaion op not supported by target.\n");
4593 return false;
4596 for (i = 0; i < nelt; i++)
4598 if (3 * i + nelt0 < nelt)
4599 sel[3 * i + nelt0] = 3 * i + nelt0;
4600 if (3 * i + nelt1 < nelt)
4601 sel[3 * i + nelt1] = 3 * i + nelt1;
4602 if (3 * i + nelt2 < nelt)
4603 sel[3 * i + nelt2] = nelt + j2++;
4605 if (!can_vec_perm_p (mode, false, sel))
4607 if (dump_enabled_p ())
4608 dump_printf (MSG_MISSED_OPTIMIZATION,
4609 "permutaion op not supported by target.\n");
4610 return false;
4613 return true;
4615 else
4617 /* If length is not equal to 3 then only a power of 2 is supported. */
4618 gcc_assert (exact_log2 (count) != -1);
4620 for (i = 0; i < nelt / 2; i++)
4622 sel[i * 2] = i;
4623 sel[i * 2 + 1] = i + nelt;
4625 if (can_vec_perm_p (mode, false, sel))
4627 for (i = 0; i < nelt; i++)
4628 sel[i] += nelt / 2;
4629 if (can_vec_perm_p (mode, false, sel))
4630 return true;
4635 if (dump_enabled_p ())
4636 dump_printf (MSG_MISSED_OPTIMIZATION,
4637 "permutaion op not supported by target.\n");
4638 return false;
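/* A stand-alone sketch of the selection vectors whose support is tested
   above for the power-of-two case (illustrative only).  For NELT elements
   per vector, the "interleave high" mask picks {0, nelt, 1, nelt + 1, ...}
   from the concatenation of the two input vectors, and the "interleave low"
   mask is the same pattern shifted by nelt/2, exactly as built for
   can_vec_perm_p above.  SEL_HIGH and SEL_LOW must each hold NELT bytes.  */

static void
illustrate_interleave_masks (unsigned int nelt,
                             unsigned char *sel_high, unsigned char *sel_low)
{
  for (unsigned int i = 0; i < nelt / 2; i++)
    {
      sel_high[i * 2] = i;                  /* element i of the first vector  */
      sel_high[i * 2 + 1] = i + nelt;       /* element i of the second vector  */
    }
  for (unsigned int i = 0; i < nelt; i++)
    sel_low[i] = sel_high[i] + nelt / 2;    /* shift the whole pattern by nelt/2  */
}

/* For nelt == 8 this yields sel_high = {0,8,1,9,2,10,3,11} and
   sel_low = {4,12,5,13,6,14,7,15}.  */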
4642 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4643 type VECTYPE. */
4645 bool
4646 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4648 return vect_lanes_optab_supported_p ("vec_store_lanes",
4649 vec_store_lanes_optab,
4650 vectype, count);
4654 /* Function vect_permute_store_chain.
4656 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4657 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4658 the data correctly for the stores. Return the final references for stores
4659 in RESULT_CHAIN.
4661 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4662 The input is 4 vectors each containing 8 elements. We assign a number to
4663 each element, the input sequence is:
4665 1st vec: 0 1 2 3 4 5 6 7
4666 2nd vec: 8 9 10 11 12 13 14 15
4667 3rd vec: 16 17 18 19 20 21 22 23
4668 4th vec: 24 25 26 27 28 29 30 31
4670 The output sequence should be:
4672 1st vec: 0 8 16 24 1 9 17 25
4673 2nd vec: 2 10 18 26 3 11 19 27
4674 3rd vec: 4 12 20 28 5 13 21 29
4675 4th vec: 6 14 22 30 7 15 23 31
4677 i.e., we interleave the contents of the four vectors in their order.
4679 We use interleave_high/low instructions to create such output. The input of
4680 each interleave_high/low operation is two vectors:
4681 1st vec 2nd vec
4682 0 1 2 3 4 5 6 7
4683 the even elements of the result vector are obtained left-to-right from the
4684 high/low elements of the first vector. The odd elements of the result are
4685 obtained left-to-right from the high/low elements of the second vector.
4686 The output of interleave_high will be: 0 4 1 5
4687 and of interleave_low: 2 6 3 7
4690 The permutation is done in log LENGTH stages. In each stage interleave_high
4691 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4692 where the first argument is taken from the first half of DR_CHAIN and the
4693 second argument from its second half.
4694 In our example,
4696 I1: interleave_high (1st vec, 3rd vec)
4697 I2: interleave_low (1st vec, 3rd vec)
4698 I3: interleave_high (2nd vec, 4th vec)
4699 I4: interleave_low (2nd vec, 4th vec)
4701 The output for the first stage is:
4703 I1: 0 16 1 17 2 18 3 19
4704 I2: 4 20 5 21 6 22 7 23
4705 I3: 8 24 9 25 10 26 11 27
4706 I4: 12 28 13 29 14 30 15 31
4708 The output of the second stage, i.e. the final result is:
4710 I1: 0 8 16 24 1 9 17 25
4711 I2: 2 10 18 26 3 11 19 27
4712 I3: 4 12 20 28 5 13 21 29
4713 I4: 6 14 22 30 7 15 23 31. */
4715 void
4716 vect_permute_store_chain (vec<tree> dr_chain,
4717 unsigned int length,
4718 gimple *stmt,
4719 gimple_stmt_iterator *gsi,
4720 vec<tree> *result_chain)
4722 tree vect1, vect2, high, low;
4723 gimple *perm_stmt;
4724 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4725 tree perm_mask_low, perm_mask_high;
4726 tree data_ref;
4727 tree perm3_mask_low, perm3_mask_high;
4728 unsigned int i, n, log_length = exact_log2 (length);
4729 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4730 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4732 result_chain->quick_grow (length);
4733 memcpy (result_chain->address (), dr_chain.address (),
4734 length * sizeof (tree));
4736 if (length == 3)
4738 unsigned int j0 = 0, j1 = 0, j2 = 0;
4740 for (j = 0; j < 3; j++)
4742 int nelt0 = ((3 - j) * nelt) % 3;
4743 int nelt1 = ((3 - j) * nelt + 1) % 3;
4744 int nelt2 = ((3 - j) * nelt + 2) % 3;
4746 for (i = 0; i < nelt; i++)
4748 if (3 * i + nelt0 < nelt)
4749 sel[3 * i + nelt0] = j0++;
4750 if (3 * i + nelt1 < nelt)
4751 sel[3 * i + nelt1] = nelt + j1++;
4752 if (3 * i + nelt2 < nelt)
4753 sel[3 * i + nelt2] = 0;
4755 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4757 for (i = 0; i < nelt; i++)
4759 if (3 * i + nelt0 < nelt)
4760 sel[3 * i + nelt0] = 3 * i + nelt0;
4761 if (3 * i + nelt1 < nelt)
4762 sel[3 * i + nelt1] = 3 * i + nelt1;
4763 if (3 * i + nelt2 < nelt)
4764 sel[3 * i + nelt2] = nelt + j2++;
4766 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4768 vect1 = dr_chain[0];
4769 vect2 = dr_chain[1];
4771 /* Create interleaving stmt:
4772 low = VEC_PERM_EXPR <vect1, vect2,
4773 {j, nelt, *, j + 1, nelt + j + 1, *,
4774 j + 2, nelt + j + 2, *, ...}> */
4775 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4776 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4777 vect2, perm3_mask_low);
4778 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4780 vect1 = data_ref;
4781 vect2 = dr_chain[2];
4782 /* Create interleaving stmt:
4783 low = VEC_PERM_EXPR <vect1, vect2,
4784 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4785 6, 7, nelt + j + 2, ...}> */
4786 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4787 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4788 vect2, perm3_mask_high);
4789 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4790 (*result_chain)[j] = data_ref;
4793 else
4795 /* If length is not equal to 3 then only a power of 2 is supported. */
4796 gcc_assert (exact_log2 (length) != -1);
4798 for (i = 0, n = nelt / 2; i < n; i++)
4800 sel[i * 2] = i;
4801 sel[i * 2 + 1] = i + nelt;
4803 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4805 for (i = 0; i < nelt; i++)
4806 sel[i] += nelt / 2;
4807 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4809 for (i = 0, n = log_length; i < n; i++)
4811 for (j = 0; j < length/2; j++)
4813 vect1 = dr_chain[j];
4814 vect2 = dr_chain[j+length/2];
4816 /* Create interleaving stmt:
4817 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4818 ...}> */
4819 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4820 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4821 vect2, perm_mask_high);
4822 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4823 (*result_chain)[2*j] = high;
4825 /* Create interleaving stmt:
4826 low = VEC_PERM_EXPR <vect1, vect2,
4827 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4828 ...}> */
4829 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4830 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4831 vect2, perm_mask_low);
4832 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4833 (*result_chain)[2*j+1] = low;
4835 memcpy (dr_chain.address (), result_chain->address (),
4836 length * sizeof (tree));
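/* A stand-alone scalar model of the power-of-two branch above (illustrative
   only), specialized to the LENGTH == 4, NELT == 8 example from the
   function comment: after log2(LENGTH) == 2 stages of interleave high/low
   the four vectors hold 0 8 16 24 1 9 17 25, 2 10 18 26 3 11 19 27, and so
   on, as listed there.  */

static void
illustrate_store_permute (int chain[4][8])
{
  unsigned char sel_high[8], sel_low[8];
  int result[4][8];

  for (int i = 0; i < 4; i++)               /* nelt / 2 == 4  */
    {
      sel_high[i * 2] = i;
      sel_high[i * 2 + 1] = i + 8;
    }
  for (int i = 0; i < 8; i++)
    sel_low[i] = sel_high[i] + 4;           /* nelt / 2 == 4  */

  for (int stage = 0; stage < 2; stage++)   /* log2 (length) == 2 stages  */
    {
      for (int j = 0; j < 2; j++)           /* length / 2 == 2 pairs  */
        for (int k = 0; k < 8; k++)
          {
            /* VEC_PERM_EXPR selects from the concatenation of the pair.  */
            unsigned char hi = sel_high[k], lo = sel_low[k];
            result[2 * j][k]
              = hi < 8 ? chain[j][hi] : chain[j + 2][hi - 8];
            result[2 * j + 1][k]
              = lo < 8 ? chain[j][lo] : chain[j + 2][lo - 8];
          }
      for (int j = 0; j < 4; j++)           /* copy back, like the memcpy above  */
        for (int k = 0; k < 8; k++)
          chain[j][k] = result[j][k];
    }
}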
4841 /* Function vect_setup_realignment
4843 This function is called when vectorizing an unaligned load using
4844 the dr_explicit_realign[_optimized] scheme.
4845 This function generates the following code at the loop prolog:
4847 p = initial_addr;
4848 x msq_init = *(floor(p)); # prolog load
4849 realignment_token = call target_builtin;
4850 loop:
4851 x msq = phi (msq_init, ---)
4853 The stmts marked with x are generated only for the case of
4854 dr_explicit_realign_optimized.
4856 The code above sets up a new (vector) pointer, pointing to the first
4857 location accessed by STMT, and a "floor-aligned" load using that pointer.
4858 It also generates code to compute the "realignment-token" (if the relevant
4859 target hook was defined), and creates a phi-node at the loop-header bb
4860 whose arguments are the result of the prolog-load (created by this
4861 function) and the result of a load that takes place in the loop (to be
4862 created by the caller to this function).
4864 For the case of dr_explicit_realign_optimized:
4865 The caller to this function uses the phi-result (msq) to create the
4866 realignment code inside the loop, and sets up the missing phi argument,
4867 as follows:
4868 loop:
4869 msq = phi (msq_init, lsq)
4870 lsq = *(floor(p')); # load in loop
4871 result = realign_load (msq, lsq, realignment_token);
4873 For the case of dr_explicit_realign:
4874 loop:
4875 msq = *(floor(p)); # load in loop
4876 p' = p + (VS-1);
4877 lsq = *(floor(p')); # load in loop
4878 result = realign_load (msq, lsq, realignment_token);
4880 Input:
4881 STMT - (scalar) load stmt to be vectorized. This load accesses
4882 a memory location that may be unaligned.
4883 BSI - place where new code is to be inserted.
4884 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4885 is used.
4887 Output:
4888 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4889 target hook, if defined.
4890 Return value - the result of the loop-header phi node. */
4892 tree
4893 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4894 tree *realignment_token,
4895 enum dr_alignment_support alignment_support_scheme,
4896 tree init_addr,
4897 struct loop **at_loop)
4899 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4900 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4901 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4902 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4903 struct loop *loop = NULL;
4904 edge pe = NULL;
4905 tree scalar_dest = gimple_assign_lhs (stmt);
4906 tree vec_dest;
4907 gimple *inc;
4908 tree ptr;
4909 tree data_ref;
4910 basic_block new_bb;
4911 tree msq_init = NULL_TREE;
4912 tree new_temp;
4913 gphi *phi_stmt;
4914 tree msq = NULL_TREE;
4915 gimple_seq stmts = NULL;
4916 bool inv_p;
4917 bool compute_in_loop = false;
4918 bool nested_in_vect_loop = false;
4919 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4920 struct loop *loop_for_initial_load = NULL;
4922 if (loop_vinfo)
4924 loop = LOOP_VINFO_LOOP (loop_vinfo);
4925 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4928 gcc_assert (alignment_support_scheme == dr_explicit_realign
4929 || alignment_support_scheme == dr_explicit_realign_optimized);
4931 /* We need to generate three things:
4932 1. the misalignment computation
4933 2. the extra vector load (for the optimized realignment scheme).
4934 3. the phi node for the two vectors from which the realignment is
4935 done (for the optimized realignment scheme). */
4937 /* 1. Determine where to generate the misalignment computation.
4939 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4940 calculation will be generated by this function, outside the loop (in the
4941 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4942 caller, inside the loop.
4944 Background: If the misalignment remains fixed throughout the iterations of
4945 the loop, then both realignment schemes are applicable, and also the
4946 misalignment computation can be done outside LOOP. This is because we are
4947 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4948 are a multiple of VS (the Vector Size), and therefore the misalignment in
4949 different vectorized LOOP iterations is always the same.
4950 The problem arises only if the memory access is in an inner-loop nested
4951 inside LOOP, which is now being vectorized using outer-loop vectorization.
4952 This is the only case when the misalignment of the memory access may not
4953 remain fixed throughout the iterations of the inner-loop (as explained in
4954 detail in vect_supportable_dr_alignment). In this case, not only is the
4955 optimized realignment scheme not applicable, but also the misalignment
4956 computation (and generation of the realignment token that is passed to
4957 REALIGN_LOAD) has to be done inside the loop.
4959 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4960 or not, which in turn determines if the misalignment is computed inside
4961 the inner-loop, or outside LOOP. */
4963 if (init_addr != NULL_TREE || !loop_vinfo)
4965 compute_in_loop = true;
4966 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4970 /* 2. Determine where to generate the extra vector load.
4972 For the optimized realignment scheme, instead of generating two vector
4973 loads in each iteration, we generate a single extra vector load in the
4974 preheader of the loop, and in each iteration reuse the result of the
4975 vector load from the previous iteration. In case the memory access is in
4976 an inner-loop nested inside LOOP, which is now being vectorized using
4977 outer-loop vectorization, we need to determine whether this initial vector
4978 load should be generated at the preheader of the inner-loop, or can be
4979 generated at the preheader of LOOP. If the memory access has no evolution
4980 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4981 to be generated inside LOOP (in the preheader of the inner-loop). */
4983 if (nested_in_vect_loop)
4985 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4986 bool invariant_in_outerloop =
4987 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4988 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4990 else
4991 loop_for_initial_load = loop;
4992 if (at_loop)
4993 *at_loop = loop_for_initial_load;
4995 if (loop_for_initial_load)
4996 pe = loop_preheader_edge (loop_for_initial_load);
4998 /* 3. For the case of the optimized realignment, create the first vector
4999 load at the loop preheader. */
5001 if (alignment_support_scheme == dr_explicit_realign_optimized)
5003 /* Create msq_init = *(floor(p1)) in the loop preheader */
5004 gassign *new_stmt;
5006 gcc_assert (!compute_in_loop);
5007 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5008 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5009 NULL_TREE, &init_addr, NULL, &inc,
5010 true, &inv_p);
5011 if (TREE_CODE (ptr) == SSA_NAME)
5012 new_temp = copy_ssa_name (ptr);
5013 else
5014 new_temp = make_ssa_name (TREE_TYPE (ptr));
5015 new_stmt = gimple_build_assign
5016 (new_temp, BIT_AND_EXPR, ptr,
5017 build_int_cst (TREE_TYPE (ptr),
5018 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
5019 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5020 gcc_assert (!new_bb);
5021 data_ref
5022 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5023 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5024 new_stmt = gimple_build_assign (vec_dest, data_ref);
5025 new_temp = make_ssa_name (vec_dest, new_stmt);
5026 gimple_assign_set_lhs (new_stmt, new_temp);
5027 if (pe)
5029 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5030 gcc_assert (!new_bb);
5032 else
5033 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5035 msq_init = gimple_assign_lhs (new_stmt);
5038 /* 4. Create realignment token using a target builtin, if available.
5039 It is done either inside the containing loop, or before LOOP (as
5040 determined above). */
5042 if (targetm.vectorize.builtin_mask_for_load)
5044 gcall *new_stmt;
5045 tree builtin_decl;
5047 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5048 if (!init_addr)
5050 /* Generate the INIT_ADDR computation outside LOOP. */
5051 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5052 NULL_TREE, loop);
5053 if (loop)
5055 pe = loop_preheader_edge (loop);
5056 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5057 gcc_assert (!new_bb);
5059 else
5060 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5063 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5064 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5065 vec_dest =
5066 vect_create_destination_var (scalar_dest,
5067 gimple_call_return_type (new_stmt));
5068 new_temp = make_ssa_name (vec_dest, new_stmt);
5069 gimple_call_set_lhs (new_stmt, new_temp);
5071 if (compute_in_loop)
5072 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5073 else
5075 /* Generate the misalignment computation outside LOOP. */
5076 pe = loop_preheader_edge (loop);
5077 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5078 gcc_assert (!new_bb);
5081 *realignment_token = gimple_call_lhs (new_stmt);
5083 /* The result of the CALL_EXPR to this builtin is determined from
5084 the value of the parameter and no global variables are touched
5085 which makes the builtin a "const" function. Requiring the
5086 builtin to have the "const" attribute makes it unnecessary
5087 to call mark_call_clobbered. */
5088 gcc_assert (TREE_READONLY (builtin_decl));
5091 if (alignment_support_scheme == dr_explicit_realign)
5092 return msq;
5094 gcc_assert (!compute_in_loop);
5095 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5098 /* 5. Create msq = phi <msq_init, lsq> in loop */
5100 pe = loop_preheader_edge (containing_loop);
5101 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5102 msq = make_ssa_name (vec_dest);
5103 phi_stmt = create_phi_node (msq, containing_loop->header);
5104 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5106 return msq;
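/* A stand-alone byte-level model of the dr_explicit_realign scheme above
   (illustrative only; it assumes the little-endian "concatenate and shift"
   behaviour that REALIGN_LOAD provides conceptually, and treats P as a byte
   index into MEM rather than a raw pointer).  Two aligned VS-byte loads at
   floor(p) and floor(p + VS - 1) are combined, using the misalignment as
   the realignment token, to yield the VS bytes starting at the possibly
   unaligned P.  */

static void
illustrate_realign_load (const unsigned char *mem, unsigned long p,
                         unsigned int vs, unsigned char *result)
{
  unsigned long msq_base = p - p % vs;                        /* floor (p)  */
  unsigned long lsq_base = (p + vs - 1) - (p + vs - 1) % vs;  /* floor (p + VS - 1)  */
  unsigned int off = (unsigned int) (p % vs);                 /* realignment token  */

  for (unsigned int i = 0; i < vs; i++)
    result[i] = (i + off < vs
                 ? mem[msq_base + i + off]           /* byte comes from msq  */
                 : mem[lsq_base + (i + off - vs)]);  /* byte comes from lsq  */
}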
5110 /* Function vect_grouped_load_supported.
5112 Returns TRUE if even and odd permutations are supported,
5113 and FALSE otherwise. */
5115 bool
5116 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
5118 machine_mode mode = TYPE_MODE (vectype);
5120 /* vect_permute_load_chain requires the group size to be equal to 3 or
5121 be a power of two. */
5122 if (count != 3 && exact_log2 (count) == -1)
5124 if (dump_enabled_p ())
5125 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5126 "the size of the group of accesses"
5127 " is not a power of 2 or not equal to 3\n");
5128 return false;
5131 /* Check that the permutation is supported. */
5132 if (VECTOR_MODE_P (mode))
5134 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5135 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5137 if (count == 3)
5139 unsigned int k;
5140 for (k = 0; k < 3; k++)
5142 for (i = 0; i < nelt; i++)
5143 if (3 * i + k < 2 * nelt)
5144 sel[i] = 3 * i + k;
5145 else
5146 sel[i] = 0;
5147 if (!can_vec_perm_p (mode, false, sel))
5149 if (dump_enabled_p ())
5150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5151 "shuffle of 3 loads is not supported by"
5152 " target\n");
5153 return false;
5155 for (i = 0, j = 0; i < nelt; i++)
5156 if (3 * i + k < 2 * nelt)
5157 sel[i] = i;
5158 else
5159 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5160 if (!can_vec_perm_p (mode, false, sel))
5162 if (dump_enabled_p ())
5163 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5164 "shuffle of 3 loads is not supported by"
5165 " target\n");
5166 return false;
5169 return true;
5171 else
5173 /* If length is not equal to 3 then only a power of 2 is supported. */
5174 gcc_assert (exact_log2 (count) != -1);
5175 for (i = 0; i < nelt; i++)
5176 sel[i] = i * 2;
5177 if (can_vec_perm_p (mode, false, sel))
5179 for (i = 0; i < nelt; i++)
5180 sel[i] = i * 2 + 1;
5181 if (can_vec_perm_p (mode, false, sel))
5182 return true;
5187 if (dump_enabled_p ())
5188 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5189 "extract even/odd not supported by target\n");
5190 return false;
5193 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5194 type VECTYPE. */
5196 bool
5197 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5199 return vect_lanes_optab_supported_p ("vec_load_lanes",
5200 vec_load_lanes_optab,
5201 vectype, count);
5204 /* Function vect_permute_load_chain.
5206 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5207 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5208 the input data correctly. Return the final references for loads in
5209 RESULT_CHAIN.
5211 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5212 The input is 4 vectors each containing 8 elements. We assign a number to each
5213 element, the input sequence is:
5215 1st vec: 0 1 2 3 4 5 6 7
5216 2nd vec: 8 9 10 11 12 13 14 15
5217 3rd vec: 16 17 18 19 20 21 22 23
5218 4th vec: 24 25 26 27 28 29 30 31
5220 The output sequence should be:
5222 1st vec: 0 4 8 12 16 20 24 28
5223 2nd vec: 1 5 9 13 17 21 25 29
5224 3rd vec: 2 6 10 14 18 22 26 30
5225 4th vec: 3 7 11 15 19 23 27 31
5227 i.e., the first output vector should contain the first elements of each
5228 interleaving group, etc.
5230 We use extract_even/odd instructions to create such output. The input of
5231 each extract_even/odd operation is two vectors
5232 1st vec 2nd vec
5233 0 1 2 3 4 5 6 7
5235 and the output is the vector of extracted even/odd elements. The output of
5236 extract_even will be: 0 2 4 6
5237 and of extract_odd: 1 3 5 7
5240 The permutation is done in log LENGTH stages. In each stage extract_even
5241 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5242 their order. In our example,
5244 E1: extract_even (1st vec, 2nd vec)
5245 E2: extract_odd (1st vec, 2nd vec)
5246 E3: extract_even (3rd vec, 4th vec)
5247 E4: extract_odd (3rd vec, 4th vec)
5249 The output for the first stage will be:
5251 E1: 0 2 4 6 8 10 12 14
5252 E2: 1 3 5 7 9 11 13 15
5253 E3: 16 18 20 22 24 26 28 30
5254 E4: 17 19 21 23 25 27 29 31
5256 In order to proceed and create the correct sequence for the next stage (or
5257 for the correct output, if the second stage is the last one, as in our
5258 example), we first put the output of extract_even operation and then the
5259 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5260 The input for the second stage is:
5262 1st vec (E1): 0 2 4 6 8 10 12 14
5263 2nd vec (E3): 16 18 20 22 24 26 28 30
5264 3rd vec (E2): 1 3 5 7 9 11 13 15
5265 4th vec (E4): 17 19 21 23 25 27 29 31
5267 The output of the second stage:
5269 E1: 0 4 8 12 16 20 24 28
5270 E2: 2 6 10 14 18 22 26 30
5271 E3: 1 5 9 13 17 21 25 29
5272 E4: 3 7 11 15 19 23 27 31
5274 And RESULT_CHAIN after reordering:
5276 1st vec (E1): 0 4 8 12 16 20 24 28
5277 2nd vec (E3): 1 5 9 13 17 21 25 29
5278 3rd vec (E2): 2 6 10 14 18 22 26 30
5279 4th vec (E4): 3 7 11 15 19 23 27 31. */
5281 static void
5282 vect_permute_load_chain (vec<tree> dr_chain,
5283 unsigned int length,
5284 gimple *stmt,
5285 gimple_stmt_iterator *gsi,
5286 vec<tree> *result_chain)
5288 tree data_ref, first_vect, second_vect;
5289 tree perm_mask_even, perm_mask_odd;
5290 tree perm3_mask_low, perm3_mask_high;
5291 gimple *perm_stmt;
5292 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5293 unsigned int i, j, log_length = exact_log2 (length);
5294 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5295 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5297 result_chain->quick_grow (length);
5298 memcpy (result_chain->address (), dr_chain.address (),
5299 length * sizeof (tree));
5301 if (length == 3)
5303 unsigned int k;
5305 for (k = 0; k < 3; k++)
5307 for (i = 0; i < nelt; i++)
5308 if (3 * i + k < 2 * nelt)
5309 sel[i] = 3 * i + k;
5310 else
5311 sel[i] = 0;
5312 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5314 for (i = 0, j = 0; i < nelt; i++)
5315 if (3 * i + k < 2 * nelt)
5316 sel[i] = i;
5317 else
5318 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5320 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5322 first_vect = dr_chain[0];
5323 second_vect = dr_chain[1];
5325 /* Create interleaving stmt (low part of):
5326 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5327 ...}> */
5328 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5329 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5330 second_vect, perm3_mask_low);
5331 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5333 /* Create interleaving stmt (high part of):
5334 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5335 ...}> */
5336 first_vect = data_ref;
5337 second_vect = dr_chain[2];
5338 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5339 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5340 second_vect, perm3_mask_high);
5341 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5342 (*result_chain)[k] = data_ref;
5345 else
5347 /* If length is not equal to 3 then only a power of 2 is supported. */
5348 gcc_assert (exact_log2 (length) != -1);
5350 for (i = 0; i < nelt; ++i)
5351 sel[i] = i * 2;
5352 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5354 for (i = 0; i < nelt; ++i)
5355 sel[i] = i * 2 + 1;
5356 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5358 for (i = 0; i < log_length; i++)
5360 for (j = 0; j < length; j += 2)
5362 first_vect = dr_chain[j];
5363 second_vect = dr_chain[j+1];
5365 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5366 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5367 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5368 first_vect, second_vect,
5369 perm_mask_even);
5370 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5371 (*result_chain)[j/2] = data_ref;
5373 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5374 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5375 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5376 first_vect, second_vect,
5377 perm_mask_odd);
5378 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5379 (*result_chain)[j/2+length/2] = data_ref;
5381 memcpy (dr_chain.address (), result_chain->address (),
5382 length * sizeof (tree));
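/* A stand-alone scalar model of the power-of-two branch above (illustrative
   only), specialized to the LENGTH == 4, NELT == 8 example from the
   function comment: two stages of extract even/odd turn the interleaved
   input into 0 4 8 12 16 20 24 28, 1 5 9 13 17 21 25 29, and so on, with
   the even results stored in the first half of the chain and the odd
   results in the second half at each stage.  */

static void
illustrate_load_permute (int chain[4][8])
{
  int result[4][8];

  for (int stage = 0; stage < 2; stage++)   /* log2 (length) == 2 stages  */
    {
      for (int j = 0; j < 4; j += 2)
        for (int k = 0; k < 8; k++)
          {
            /* Even/odd elements of the concatenation of chain[j], chain[j+1].  */
            int even = 2 * k, odd = 2 * k + 1;
            result[j / 2][k]
              = even < 8 ? chain[j][even] : chain[j + 1][even - 8];
            result[j / 2 + 2][k]
              = odd < 8 ? chain[j][odd] : chain[j + 1][odd - 8];
          }
      for (int j = 0; j < 4; j++)           /* copy back, like the memcpy above  */
        for (int k = 0; k < 8; k++)
          chain[j][k] = result[j][k];
    }
}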
5387 /* Function vect_shift_permute_load_chain.
5389 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5390 sequence of stmts to reorder the input data accordingly.
5391 Return the final references for loads in RESULT_CHAIN.
5392 Return true if successful, false otherwise.
5394 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5395 The input is 3 vectors each containing 8 elements. We assign a
5396 number to each element, the input sequence is:
5398 1st vec: 0 1 2 3 4 5 6 7
5399 2nd vec: 8 9 10 11 12 13 14 15
5400 3rd vec: 16 17 18 19 20 21 22 23
5402 The output sequence should be:
5404 1st vec: 0 3 6 9 12 15 18 21
5405 2nd vec: 1 4 7 10 13 16 19 22
5406 3rd vec: 2 5 8 11 14 17 20 23
5408 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5410 First we shuffle all 3 vectors to get correct elements order:
5412 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5413 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5414 3rd vec: (16 19 22) (17 20 23) (18 21)
5416 Next we unite and shift vector 3 times:
5418 1st step:
5419 shift right by 6 the concatenation of:
5420 "1st vec" and "2nd vec"
5421 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5422 "2nd vec" and "3rd vec"
5423 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5424 "3rd vec" and "1st vec"
5425 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5426 | New vectors |
5428 So that now new vectors are:
5430 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5431 2nd vec: (10 13) (16 19 22) (17 20 23)
5432 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5434 2nd step:
5435 shift right by 5 the concatenation of:
5436 "1st vec" and "3rd vec"
5437 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5438 "2nd vec" and "1st vec"
5439 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5440 "3rd vec" and "2nd vec"
5441 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5442 | New vectors |
5444 So that now new vectors are:
5446 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5447 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5448 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5450 3rd step:
5451 shift right by 5 the concatenation of:
5452 "1st vec" and "1st vec"
5453 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5454 shift right by 3 the concatenation of:
5455 "2nd vec" and "2nd vec"
5456 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5457 | New vectors |
5459 So that now all vectors are READY:
5460 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5461 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5462 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5464 This algorithm is faster than the one in vect_permute_load_chain if:
5465 1. "shift of a concatenation" is faster than general permutation.
5466 This is usually so.
5467 2. The TARGET machine can't execute vector instructions in parallel.
5468 This matters because each step of the algorithm depends on the previous one;
5469 the algorithm in vect_permute_load_chain is much more parallel.
5471 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
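/* A minimal standalone model in plain C of the LENGTH == 3 path of this
   function for NELT == 8, using the same mask formulas, operand order and
   result placement as the code below. It is illustrative only (vec_perm,
   NELT and the other names are made up for the example) and is guarded
   with #if 0 so it does not affect compilation of this file; built on its
   own, the asserts check that the output is 0 3 6 ... / 1 4 7 ... /
   2 5 8 ... as in the worked example above. */
#if 0
#include <assert.h>
#include <string.h>

#define NELT 8

/* Model of VEC_PERM_EXPR: OUT[i] = concat (A, B)[SEL[i]].  */
static void
vec_perm (const int *a, const int *b, const unsigned char *sel, int *out)
{
  int concat[2 * NELT];
  memcpy (concat, a, sizeof (int) * NELT);
  memcpy (concat + NELT, b, sizeof (int) * NELT);
  for (int i = 0; i < NELT; i++)
    out[i] = concat[sel[i]];
}

int
main (void)
{
  int dr_chain[3][NELT], vect[3][NELT], vect_shift[3][NELT], result[3][NELT];
  unsigned char perm3[NELT], shift1[NELT], shift2[NELT], shift3[NELT];
  unsigned char shift4[NELT];
  unsigned int i, k, l;

  /* Three input vectors holding elements 0..7, 8..15, 16..23.  */
  for (k = 0; k < 3; k++)
    for (i = 0; i < NELT; i++)
      dr_chain[k][i] = k * NELT + i;

  /* Same mask formulas as below: perm3 = {0 3 6 1 4 7 2 5},
     shift1 = {6..13}, shift2 = {5..12}, shift3 = {3..10}, shift4 = {5..12}.  */
  for (k = 0, l = 0, i = 0; i < NELT; i++)
    {
      if (3 * k + (l % 3) >= NELT)
	{
	  k = 0;
	  l += 3 - (NELT % 3);
	}
      perm3[i] = 3 * k + (l % 3);
      k++;
    }
  for (i = 0; i < NELT; i++)
    {
      shift1[i] = 2 * (NELT / 3) + (NELT % 3) + i;
      shift2[i] = 2 * (NELT / 3) + 1 + i;
      shift3[i] = (NELT / 3) + (NELT % 3) / 2 + i;
      shift4[i] = 2 * (NELT / 3) + (NELT % 3) / 2 + i;
    }

  /* Initial shuffle, the two shift steps, then the final placements.  */
  for (k = 0; k < 3; k++)
    vec_perm (dr_chain[k], dr_chain[k], perm3, vect[k]);
  for (k = 0; k < 3; k++)
    vec_perm (vect[k % 3], vect[(k + 1) % 3], shift1, vect_shift[k]);
  for (k = 0; k < 3; k++)
    vec_perm (vect_shift[(4 - k) % 3], vect_shift[(3 - k) % 3], shift2,
	      vect[k]);
  memcpy (result[3 - (NELT % 3)], vect[2], sizeof (result[0]));
  vec_perm (vect[0], vect[0], shift3, result[NELT % 3]);
  vec_perm (vect[1], vect[1], shift4, result[0]);

  /* Output vector F must hold F, F + 3, F + 6, ...  */
  for (k = 0; k < 3; k++)
    for (i = 0; i < NELT; i++)
      assert (result[k][i] == (int) (3 * i + k));
  return 0;
}
#endif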
5474 static bool
5475 vect_shift_permute_load_chain (vec<tree> dr_chain,
5476 unsigned int length,
5477 gimple *stmt,
5478 gimple_stmt_iterator *gsi,
5479 vec<tree> *result_chain)
5481 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5482 tree perm2_mask1, perm2_mask2, perm3_mask;
5483 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5484 gimple *perm_stmt;
5486 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5487 unsigned int i;
5488 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5489 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5490 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5491 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5493 result_chain->quick_grow (length);
5494 memcpy (result_chain->address (), dr_chain.address (),
5495 length * sizeof (tree));
5497 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5499 unsigned int j, log_length = exact_log2 (length);
5500 for (i = 0; i < nelt / 2; ++i)
5501 sel[i] = i * 2;
5502 for (i = 0; i < nelt / 2; ++i)
5503 sel[nelt / 2 + i] = i * 2 + 1;
5504 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5506 if (dump_enabled_p ())
5507 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5508 "shuffle of 2 fields structure is not \
5509 supported by target\n");
5510 return false;
5512 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5514 for (i = 0; i < nelt / 2; ++i)
5515 sel[i] = i * 2 + 1;
5516 for (i = 0; i < nelt / 2; ++i)
5517 sel[nelt / 2 + i] = i * 2;
5518 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5520 if (dump_enabled_p ())
5521 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5522 "shuffle of 2 fields structure is not \
5523 supported by target\n");
5524 return false;
5526 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5528 /* Generating permutation constant to shift all elements.
5529 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5530 for (i = 0; i < nelt; i++)
5531 sel[i] = nelt / 2 + i;
5532 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5534 if (dump_enabled_p ())
5535 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5536 "shift permutation is not supported by target\n");
5537 return false;
5539 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5541 /* Generating permutation constant to select a vector from 2 vectors.
5542 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5543 for (i = 0; i < nelt / 2; i++)
5544 sel[i] = i;
5545 for (i = nelt / 2; i < nelt; i++)
5546 sel[i] = nelt + i;
5547 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5549 if (dump_enabled_p ())
5550 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5551 "select is not supported by target\n");
5552 return false;
5554 select_mask = vect_gen_perm_mask_checked (vectype, sel);
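/* For example, with NELT == 8, LENGTH == 2 and an input chain holding
   elements 0 ... 15, PERM2_MASK1 is {0 2 4 6 1 3 5 7} and PERM2_MASK2 is
   {1 3 5 7 0 2 4 6}, so in the loop below vect[0] becomes
   (0 2 4 6 1 3 5 7) and vect[1] becomes (9 11 13 15 8 10 12 14).
   SHIFT1_MASK {4 5 6 7 8 9 10 11} then extracts the odd elements
   (1 3 5 7 9 11 13 15) and SELECT_MASK {0 1 2 3 12 13 14 15} the even
   elements (0 2 4 6 8 10 12 14). */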
5556 for (i = 0; i < log_length; i++)
5558 for (j = 0; j < length; j += 2)
5560 first_vect = dr_chain[j];
5561 second_vect = dr_chain[j + 1];
5563 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5564 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5565 first_vect, first_vect,
5566 perm2_mask1);
5567 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5568 vect[0] = data_ref;
5570 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5571 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5572 second_vect, second_vect,
5573 perm2_mask2);
5574 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5575 vect[1] = data_ref;
5577 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5578 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5579 vect[0], vect[1], shift1_mask);
5580 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5581 (*result_chain)[j/2 + length/2] = data_ref;
5583 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5584 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5585 vect[0], vect[1], select_mask);
5586 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5587 (*result_chain)[j/2] = data_ref;
5589 memcpy (dr_chain.address (), result_chain->address (),
5590 length * sizeof (tree));
5592 return true;
5594 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5596 unsigned int k = 0, l = 0;
5598 /* Generating permutation constant to get all elements in the right order.
5599 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5600 for (i = 0; i < nelt; i++)
5602 if (3 * k + (l % 3) >= nelt)
5604 k = 0;
5605 l += (3 - (nelt % 3));
5607 sel[i] = 3 * k + (l % 3);
5608 k++;
5610 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5612 if (dump_enabled_p ())
5613 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5614 "shuffle of 3 fields structure is not \
5615 supported by target\n");
5616 return false;
5618 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5620 /* Generating permutation constant to shift all elements.
5621 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5622 for (i = 0; i < nelt; i++)
5623 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5624 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5626 if (dump_enabled_p ())
5627 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5628 "shift permutation is not supported by target\n");
5629 return false;
5631 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5633 /* Generating permutation constant to shift all elements.
5634 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5635 for (i = 0; i < nelt; i++)
5636 sel[i] = 2 * (nelt / 3) + 1 + i;
5637 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5639 if (dump_enabled_p ())
5640 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5641 "shift permutation is not supported by target\n");
5642 return false;
5644 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5646 /* Generating permutation constant to shift all elements.
5647 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5648 for (i = 0; i < nelt; i++)
5649 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5650 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5652 if (dump_enabled_p ())
5653 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5654 "shift permutation is not supported by target\n");
5655 return false;
5657 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5659 /* Generating permutation constant to shift all elements.
5660 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5661 for (i = 0; i < nelt; i++)
5662 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5663 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5665 if (dump_enabled_p ())
5666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5667 "shift permutation is not supported by target\n");
5668 return false;
5670 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
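/* Shuffle each input vector with PERM3_MASK; this is the initial
   reordering step in the function comment above. */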
5672 for (k = 0; k < 3; k++)
5674 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5675 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5676 dr_chain[k], dr_chain[k],
5677 perm3_mask);
5678 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5679 vect[k] = data_ref;
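/* First shift step: shift the concatenation of each shuffled vector
   with the next one, using SHIFT1_MASK. */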
5682 for (k = 0; k < 3; k++)
5684 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5685 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5686 vect[k % 3], vect[(k + 1) % 3],
5687 shift1_mask);
5688 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5689 vect_shift[k] = data_ref;
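/* Second shift step, applied to the results of the first one, using
   SHIFT2_MASK. */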
5692 for (k = 0; k < 3; k++)
5694 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5695 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5696 vect_shift[(4 - k) % 3],
5697 vect_shift[(3 - k) % 3],
5698 shift2_mask);
5699 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5700 vect[k] = data_ref;
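/* Third step: VECT[2] is already in the right order; the other two
   vectors each need one more shift (SHIFT3_MASK and SHIFT4_MASK). */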
5703 (*result_chain)[3 - (nelt % 3)] = vect[2];
5705 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5706 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5707 vect[0], shift3_mask);
5708 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5709 (*result_chain)[nelt % 3] = data_ref;
5711 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5712 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5713 vect[1], shift4_mask);
5714 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5715 (*result_chain)[0] = data_ref;
5716 return true;
5718 return false;
5721 /* Function vect_transform_grouped_load.
5723 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5724 to perform their permutation and assign the resulting vectorized statements
5725 to the scalar statements.
5728 void
5729 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5730 gimple_stmt_iterator *gsi)
5732 machine_mode mode;
5733 vec<tree> result_chain = vNULL;
5735 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5736 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
5737 vectors that are ready for vector computation. */
5738 result_chain.create (size);
5740 /* If the reassociation width for the vector type is 2 or greater, the target
5741 machine can execute 2 or more vector instructions in parallel. Otherwise try
5742 to get the chain for the load group using vect_shift_permute_load_chain. */
5743 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5744 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5745 || exact_log2 (size) != -1
5746 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5747 gsi, &result_chain))
5748 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5749 vect_record_grouped_load_vectors (stmt, result_chain);
5750 result_chain.release ();
5753 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5754 generated as part of the vectorization of STMT. Assign the statement
5755 for each vector to the associated scalar statement. */
5757 void
5758 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5760 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5761 gimple *next_stmt, *new_stmt;
5762 unsigned int i, gap_count;
5763 tree tmp_data_ref;
5765 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5766 Since we scan the chain starting from its first node, their order
5767 corresponds to the order of data-refs in RESULT_CHAIN. */
5768 next_stmt = first_stmt;
5769 gap_count = 1;
5770 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5772 if (!next_stmt)
5773 break;
5775 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5776 code elimination pass later. No need to check for the first stmt in
5777 the group, since it always exists.
5778 GROUP_GAP is the number of steps in elements from the previous
5779 access (if there is no gap GROUP_GAP is 1). We skip loads that
5780 correspond to the gaps. */
5781 if (next_stmt != first_stmt
5782 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5784 gap_count++;
5785 continue;
5788 while (next_stmt)
5790 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5791 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5792 copies, and we put the new vector statement in the first available
5793 RELATED_STMT. */
5794 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5795 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5796 else
5798 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5800 gimple *prev_stmt =
5801 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5802 gimple *rel_stmt =
5803 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5804 while (rel_stmt)
5806 prev_stmt = rel_stmt;
5807 rel_stmt =
5808 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5811 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5812 new_stmt;
5816 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5817 gap_count = 1;
5818 /* If NEXT_STMT accesses the same DR as the previous statement,
5819 put the same TMP_DATA_REF as its vectorized statement; otherwise
5820 get the next data-ref from RESULT_CHAIN. */
5821 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5822 break;
5827 /* Function vect_can_force_dr_alignment_p.
5829 Returns whether the alignment of a DECL can be forced to be aligned
5830 on an ALIGNMENT-bit boundary. */
5832 bool
5833 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5835 if (TREE_CODE (decl) != VAR_DECL)
5836 return false;
5838 if (decl_in_symtab_p (decl)
5839 && !symtab_node::get (decl)->can_increase_alignment_p ())
5840 return false;
5842 if (TREE_STATIC (decl))
5843 return (alignment <= MAX_OFILE_ALIGNMENT);
5844 else
5845 return (alignment <= MAX_STACK_ALIGNMENT);
5849 /* Return whether the data reference DR is supported with respect to its
5850 alignment.
5851 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5852 if it is aligned, i.e., check if it is possible to vectorize it with different
5853 alignment. */
5855 enum dr_alignment_support
5856 vect_supportable_dr_alignment (struct data_reference *dr,
5857 bool check_aligned_accesses)
5859 gimple *stmt = DR_STMT (dr);
5860 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5861 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5862 machine_mode mode = TYPE_MODE (vectype);
5863 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5864 struct loop *vect_loop = NULL;
5865 bool nested_in_vect_loop = false;
5867 if (aligned_access_p (dr) && !check_aligned_accesses)
5868 return dr_aligned;
5870 /* For now assume all conditional loads/stores support unaligned
5871 access without any special code. */
5872 if (is_gimple_call (stmt)
5873 && gimple_call_internal_p (stmt)
5874 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5875 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5876 return dr_unaligned_supported;
5878 if (loop_vinfo)
5880 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5881 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5884 /* Possibly unaligned access. */
5886 /* We can choose between using the implicit realignment scheme (generating
5887 a misaligned_move stmt) and the explicit realignment scheme (generating
5888 aligned loads with a REALIGN_LOAD). There are two variants to the
5889 explicit realignment scheme: optimized, and unoptimized.
5890 We can optimize the realignment only if the step between consecutive
5891 vector loads is equal to the vector size. Since the vector memory
5892 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5893 is guaranteed that the misalignment amount remains the same throughout the
5894 execution of the vectorized loop. Therefore, we can create the
5895 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5896 at the loop preheader.
5898 However, in the case of outer-loop vectorization, when vectorizing a
5899 memory access in the inner-loop nested within the LOOP that is now being
5900 vectorized, while it is guaranteed that the misalignment of the
5901 vectorized memory access will remain the same in different outer-loop
5902 iterations, it is *not* guaranteed that it will remain the same throughout
5903 the execution of the inner-loop. This is because the inner-loop advances
5904 with the original scalar step (and not in steps of VS). If the inner-loop
5905 step happens to be a multiple of VS, then the misalignment remains fixed
5906 and we can use the optimized realignment scheme. For example:
5908 for (i=0; i<N; i++)
5909 for (j=0; j<M; j++)
5910 s += a[i+j];
5912 When vectorizing the i-loop in the above example, the step between
5913 consecutive vector loads is 1, and so the misalignment does not remain
5914 fixed across the execution of the inner-loop, and the realignment cannot
5915 be optimized (as illustrated in the following pseudo vectorized loop):
5917 for (i=0; i<N; i+=4)
5918 for (j=0; j<M; j++){
5919 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5920 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5921 // (assuming that we start from an aligned address).
5924 We therefore have to use the unoptimized realignment scheme:
5926 for (i=0; i<N; i+=4)
5927 for (j=k; j<M; j+=4)
5928 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5929 // that the misalignment of the initial address is
5930 // 0).
5932 The loop can then be vectorized as follows:
5934 for (k=0; k<4; k++){
5935 rt = get_realignment_token (&vp[k]);
5936 for (i=0; i<N; i+=4){
5937 v1 = vp[i+k];
5938 for (j=k; j<M; j+=4){
5939 v2 = vp[i+j+VS-1];
5940 va = REALIGN_LOAD <v1,v2,rt>;
5941 vs += va;
5942 v1 = v2;
5945 } */
5947 if (DR_IS_READ (dr))
5949 bool is_packed = false;
5950 tree type = (TREE_TYPE (DR_REF (dr)));
5952 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5953 && (!targetm.vectorize.builtin_mask_for_load
5954 || targetm.vectorize.builtin_mask_for_load ()))
5956 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5957 if ((nested_in_vect_loop
5958 && (TREE_INT_CST_LOW (DR_STEP (dr))
5959 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5960 || !loop_vinfo)
5961 return dr_explicit_realign;
5962 else
5963 return dr_explicit_realign_optimized;
5965 if (!known_alignment_for_access_p (dr))
5966 is_packed = not_size_aligned (DR_REF (dr));
5968 if ((TYPE_USER_ALIGN (type) && !is_packed)
5969 || targetm.vectorize.
5970 support_vector_misalignment (mode, type,
5971 DR_MISALIGNMENT (dr), is_packed))
5972 /* Can't software pipeline the loads, but can at least do them. */
5973 return dr_unaligned_supported;
5975 else
5977 bool is_packed = false;
5978 tree type = (TREE_TYPE (DR_REF (dr)));
5980 if (!known_alignment_for_access_p (dr))
5981 is_packed = not_size_aligned (DR_REF (dr));
5983 if ((TYPE_USER_ALIGN (type) && !is_packed)
5984 || targetm.vectorize.
5985 support_vector_misalignment (mode, type,
5986 DR_MISALIGNMENT (dr), is_packed))
5987 return dr_unaligned_supported;
5990 /* Unsupported. */
5991 return dr_unaligned_unsupported;