gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2016 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "tm_p.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "cgraph.h"
35 #include "dumpfile.h"
36 #include "alias.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "tree-eh.h"
40 #include "gimplify.h"
41 #include "gimple-iterator.h"
42 #include "gimplify-me.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "tree-ssa-loop-manip.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "expr.h"
50 #include "builtins.h"
51 #include "params.h"
53 /* Return true if load- or store-lanes optab OPTAB is implemented for
54 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
56 static bool
57 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
58 tree vectype, unsigned HOST_WIDE_INT count)
60 machine_mode mode, array_mode;
61 bool limit_p;
63 mode = TYPE_MODE (vectype);
64 limit_p = !targetm.array_mode_supported_p (mode, count);
65 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
66 MODE_INT, limit_p);
68 if (array_mode == BLKmode)
70 if (dump_enabled_p ())
71 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
72 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
73 GET_MODE_NAME (mode), count);
74 return false;
77 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
79 if (dump_enabled_p ())
80 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
81 "cannot use %s<%s><%s>\n", name,
82 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
83 return false;
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_NOTE, vect_location,
88 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
89 GET_MODE_NAME (mode));
91 return true;
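/* As an illustration (assuming a target such as AArch64 that provides
   load/store-lanes support): for VECTYPE V4SI and COUNT 2 the array mode
   is the 256-bit OImode, and the check above asks whether the target has
   a vec_load_lanes/vec_store_lanes pattern for <OImode, V4SImode>, which
   AArch64 implements with its ld2/st2 instructions.  */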
95 /* Return the smallest scalar part of STMT.
96 This is used to determine the vectype of the stmt. We generally set the
97 vectype according to the type of the result (lhs). For stmts whose
98 result-type is different than the type of the arguments (e.g., demotion,
99 promotion), vectype will be reset appropriately (later). Note that we have
100 to visit the smallest datatype in this function, because that determines the
101 VF. If the smallest datatype in the loop is present only as the rhs of a
102 promotion operation - we'd miss it.
103 Such a case, where a variable of this datatype does not appear in the lhs
104 anywhere in the loop, can only occur if it's an invariant: e.g.:
105 'int_x = (int) short_inv', which we'd expect to have been optimized away by
106 invariant motion. However, we cannot rely on invariant motion to always
107 take invariants out of the loop, and so in the case of promotion we also
108 have to check the rhs.
109 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
110 types. */
112 tree
113 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
114 HOST_WIDE_INT *rhs_size_unit)
116 tree scalar_type = gimple_expr_type (stmt);
117 HOST_WIDE_INT lhs, rhs;
119 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
121 if (is_gimple_assign (stmt)
122 && (gimple_assign_cast_p (stmt)
123 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
124 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
125 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
127 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
129 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
130 if (rhs < lhs)
131 scalar_type = rhs_type;
134 *lhs_size_unit = lhs;
135 *rhs_size_unit = rhs;
136 return scalar_type;
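/* For instance, assuming 16-byte vectors, in a loop body such as

     int_out[i] = (int) short_in[i];

   the 2-byte short appears only as the rhs of the widening cast, yet it
   is the smallest scalar type and yields 8 elements per vector, so the
   vectorization factor must be 8 rather than the 4 suggested by the
   4-byte int result alone.  */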
140 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
141 tested at run-time. Return TRUE if DDR was successfully inserted.
142 Return false if versioning is not supported. */
144 static bool
145 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
147 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
149 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
150 return false;
152 if (dump_enabled_p ())
154 dump_printf_loc (MSG_NOTE, vect_location,
155 "mark for run-time aliasing test between ");
156 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
157 dump_printf (MSG_NOTE, " and ");
158 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
159 dump_printf (MSG_NOTE, "\n");
162 if (optimize_loop_nest_for_size_p (loop))
164 if (dump_enabled_p ())
165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
166 "versioning not supported when optimizing"
167 " for size.\n");
168 return false;
171 /* FORNOW: We don't support versioning with outer-loop vectorization. */
172 if (loop->inner)
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
176 "versioning not yet supported for outer-loops.\n");
177 return false;
180 /* FORNOW: We don't support creating runtime alias tests for non-constant
181 step. */
182 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
183 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
185 if (dump_enabled_p ())
186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
187 "versioning not yet supported for non-constant "
188 "step\n");
189 return false;
192 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
193 return true;
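/* As an example, for a loop such as

     void f (int *p, int *q, int n)
     {
       for (int i = 0; i < n; i++)
	 p[i] = q[i] + 1;
     }

   the dependence between p[i] and q[i] cannot be resolved at compile
   time, so the DDR is queued here and the vectorized loop is guarded by
   a run-time check that the two accessed ranges do not overlap.  */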
197 /* Function vect_analyze_data_ref_dependence.
 199    Return TRUE if there might exist a dependence between memory-reference
 200    DRA and memory-reference DRB that prevents vectorization.  If the
 201    dependence can instead be checked at run time by versioning for alias,
 202    return FALSE.  Adjust *MAX_VF according to the data dependence.  */
204 static bool
205 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
206 loop_vec_info loop_vinfo, int *max_vf)
208 unsigned int i;
209 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
210 struct data_reference *dra = DDR_A (ddr);
211 struct data_reference *drb = DDR_B (ddr);
212 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
213 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
214 lambda_vector dist_v;
215 unsigned int loop_depth;
217 /* In loop analysis all data references should be vectorizable. */
218 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
219 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
220 gcc_unreachable ();
222 /* Independent data accesses. */
223 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
224 return false;
226 if (dra == drb
227 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
228 return false;
230 /* Even if we have an anti-dependence then, as the vectorized loop covers at
231 least two scalar iterations, there is always also a true dependence.
232 As the vectorizer does not re-order loads and stores we can ignore
233 the anti-dependence if TBAA can disambiguate both DRs similar to the
234 case with known negative distance anti-dependences (positive
235 distance anti-dependences would violate TBAA constraints). */
236 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
237 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
238 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
239 get_alias_set (DR_REF (drb))))
240 return false;
242 /* Unknown data dependence. */
243 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
245 /* If user asserted safelen consecutive iterations can be
246 executed concurrently, assume independence. */
247 if (loop->safelen >= 2)
249 if (loop->safelen < *max_vf)
250 *max_vf = loop->safelen;
251 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
252 return false;
255 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
256 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
258 if (dump_enabled_p ())
260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
261 "versioning for alias not supported for: "
262 "can't determine dependence between ");
263 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
264 DR_REF (dra));
265 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
266 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
267 DR_REF (drb));
268 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
270 return true;
273 if (dump_enabled_p ())
275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
276 "versioning for alias required: "
277 "can't determine dependence between ");
278 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
279 DR_REF (dra));
280 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
281 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
282 DR_REF (drb));
283 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
286 /* Add to list of ddrs that need to be tested at run-time. */
287 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
290 /* Known data dependence. */
291 if (DDR_NUM_DIST_VECTS (ddr) == 0)
293 /* If user asserted safelen consecutive iterations can be
294 executed concurrently, assume independence. */
295 if (loop->safelen >= 2)
297 if (loop->safelen < *max_vf)
298 *max_vf = loop->safelen;
299 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
300 return false;
303 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
304 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
306 if (dump_enabled_p ())
308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
309 "versioning for alias not supported for: "
310 "bad dist vector for ");
311 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
312 DR_REF (dra));
313 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
314 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
315 DR_REF (drb));
316 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
318 return true;
321 if (dump_enabled_p ())
323 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
324 "versioning for alias required: "
325 "bad dist vector for ");
326 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
327 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
328 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
329 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
331 /* Add to list of ddrs that need to be tested at run-time. */
332 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
335 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
336 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
338 int dist = dist_v[loop_depth];
340 if (dump_enabled_p ())
341 dump_printf_loc (MSG_NOTE, vect_location,
342 "dependence distance = %d.\n", dist);
344 if (dist == 0)
346 if (dump_enabled_p ())
348 dump_printf_loc (MSG_NOTE, vect_location,
349 "dependence distance == 0 between ");
350 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
351 dump_printf (MSG_NOTE, " and ");
352 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 353          dump_printf (MSG_NOTE, "\n");
356 /* When we perform grouped accesses and perform implicit CSE
357 by detecting equal accesses and doing disambiguation with
358 runtime alias tests like for
359 .. = a[i];
360 .. = a[i+1];
361 a[i] = ..;
362 a[i+1] = ..;
363 *p = ..;
364 .. = a[i];
365 .. = a[i+1];
366 where we will end up loading { a[i], a[i+1] } once, make
367 sure that inserting group loads before the first load and
368 stores after the last store will do the right thing.
369 Similar for groups like
370 a[i] = ...;
371 ... = a[i];
372 a[i+1] = ...;
373 where loads from the group interleave with the store. */
374 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
375 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
377 gimple *earlier_stmt;
378 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
379 if (DR_IS_WRITE
380 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
382 if (dump_enabled_p ())
383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
384 "READ_WRITE dependence in interleaving."
385 "\n");
386 return true;
390 continue;
393 if (dist > 0 && DDR_REVERSED_P (ddr))
395 /* If DDR_REVERSED_P the order of the data-refs in DDR was
396 reversed (to make distance vector positive), and the actual
397 distance is negative. */
398 if (dump_enabled_p ())
399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
400 "dependence distance negative.\n");
401 /* Record a negative dependence distance to later limit the
402 amount of stmt copying / unrolling we can perform.
403 Only need to handle read-after-write dependence. */
404 if (DR_IS_READ (drb)
405 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
406 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
407 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
408 continue;
411 if (abs (dist) >= 2
412 && abs (dist) < *max_vf)
414 /* The dependence distance requires reduction of the maximal
415 vectorization factor. */
416 *max_vf = abs (dist);
417 if (dump_enabled_p ())
418 dump_printf_loc (MSG_NOTE, vect_location,
419 "adjusting maximal vectorization factor to %i\n",
420 *max_vf);
423 if (abs (dist) >= *max_vf)
425 /* Dependence distance does not create dependence, as far as
426 vectorization is concerned, in this case. */
427 if (dump_enabled_p ())
428 dump_printf_loc (MSG_NOTE, vect_location,
429 "dependence distance >= VF.\n");
430 continue;
433 if (dump_enabled_p ())
435 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
436 "not vectorized, possible dependence "
437 "between data-refs ");
 438      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 439      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 440      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 441      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
444 return true;
447 return false;
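/* To illustrate, for

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + x;

   the dependence distance is 3, so the code above reduces *max_vf to 3:
   at most three iterations may be executed together.  A distance of 0,
   as in a[i] = a[i] + x, does not prevent vectorization.  */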
450 /* Function vect_analyze_data_ref_dependences.
452 Examine all the data references in the loop, and make sure there do not
453 exist any data dependences between them. Set *MAX_VF according to
454 the maximum vectorization factor the data dependences allow. */
456 bool
457 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
459 unsigned int i;
460 struct data_dependence_relation *ddr;
462 if (dump_enabled_p ())
463 dump_printf_loc (MSG_NOTE, vect_location,
464 "=== vect_analyze_data_ref_dependences ===\n");
466 LOOP_VINFO_DDRS (loop_vinfo)
467 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
468 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
469 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
470 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
471 &LOOP_VINFO_DDRS (loop_vinfo),
472 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
473 return false;
475 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
476 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
477 return false;
479 return true;
483 /* Function vect_slp_analyze_data_ref_dependence.
 485    Return TRUE if there might exist a dependence between memory-reference
 486    DRA and memory-reference DRB that prevents SLP vectorization.  Unlike
 487    the loop analysis above, no run-time alias versioning and no
 488    vectorization-factor adjustment is done here.  */
490 static bool
491 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
493 struct data_reference *dra = DDR_A (ddr);
494 struct data_reference *drb = DDR_B (ddr);
496 /* We need to check dependences of statements marked as unvectorizable
497 as well, they still can prohibit vectorization. */
499 /* Independent data accesses. */
500 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
501 return false;
503 if (dra == drb)
504 return false;
506 /* Read-read is OK. */
507 if (DR_IS_READ (dra) && DR_IS_READ (drb))
508 return false;
510 /* If dra and drb are part of the same interleaving chain consider
511 them independent. */
512 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
513 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
514 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
515 return false;
517 /* Unknown data dependence. */
518 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
520 if (dump_enabled_p ())
522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
523 "can't determine dependence between ");
524 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
525 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
526 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
527 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
530 else if (dump_enabled_p ())
532 dump_printf_loc (MSG_NOTE, vect_location,
533 "determined dependence between ");
534 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
535 dump_printf (MSG_NOTE, " and ");
536 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
537 dump_printf (MSG_NOTE, "\n");
540 return true;
 544 /* Analyze dependences involved in the transform of SLP NODE.  STORES
 545    contains the scalar stores of this instance if we are disambiguating
 546    the loads, and LAST_STORE is the last of those stores.  */
548 static bool
549 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
550 vec<gimple *> stores, gimple *last_store)
552 /* This walks over all stmts involved in the SLP load/store done
553 in NODE verifying we can sink them up to the last stmt in the
554 group. */
555 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
556 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
558 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
559 if (access == last_access)
560 continue;
561 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
562 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
563 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
565 gimple *stmt = gsi_stmt (gsi);
566 if (! gimple_vuse (stmt)
567 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
568 continue;
570 /* If we couldn't record a (single) data reference for this
571 stmt we have to give up. */
572 /* ??? Here and below if dependence analysis fails we can resort
573 to the alias oracle which can handle more kinds of stmts. */
574 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
575 if (!dr_b)
576 return false;
578 /* If we run into a store of this same instance (we've just
579 marked those) then delay dependence checking until we run
580 into the last store because this is where it will have
581 been sunk to (and we verify if we can do that as well). */
582 if (gimple_visited_p (stmt))
584 if (stmt != last_store)
585 continue;
586 unsigned i;
587 gimple *store;
588 FOR_EACH_VEC_ELT (stores, i, store)
590 data_reference *store_dr
591 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
592 ddr_p ddr = initialize_data_dependence_relation
593 (dr_a, store_dr, vNULL);
594 if (vect_slp_analyze_data_ref_dependence (ddr))
596 free_dependence_relation (ddr);
597 return false;
599 free_dependence_relation (ddr);
603 ddr_p ddr = initialize_data_dependence_relation (dr_a, dr_b, vNULL);
604 if (vect_slp_analyze_data_ref_dependence (ddr))
606 free_dependence_relation (ddr);
607 return false;
609 free_dependence_relation (ddr);
612 return true;
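/* As an illustration, for an SLP store group interrupted by an unrelated
   store,

     a[0] = x;
     *p   = z;
     a[1] = y;

   the store to a[0] is sunk down to the position of a[1] when the group
   is vectorized, so the walk above must show that *p cannot alias the
   group, or the instance is rejected.  */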
 616 /* Function vect_slp_analyze_instance_dependence.
 618    Examine all the data references in the SLP instance, and make sure there
 619    do not exist any data dependences between them that would prevent
 620    vectorizing the instance.  */
622 bool
623 vect_slp_analyze_instance_dependence (slp_instance instance)
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE, vect_location,
627 "=== vect_slp_analyze_instance_dependence ===\n");
629 /* The stores of this instance are at the root of the SLP tree. */
630 slp_tree store = SLP_INSTANCE_TREE (instance);
631 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
632 store = NULL;
634 /* Verify we can sink stores to the vectorized stmt insert location. */
635 gimple *last_store = NULL;
636 if (store)
638 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
639 return false;
641 /* Mark stores in this instance and remember the last one. */
642 last_store = vect_find_last_scalar_stmt_in_slp (store);
643 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
644 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
647 bool res = true;
649 /* Verify we can sink loads to the vectorized stmt insert location,
650 special-casing stores of this instance. */
651 slp_tree load;
652 unsigned int i;
653 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
654 if (! vect_slp_analyze_node_dependences (instance, load,
655 store
656 ? SLP_TREE_SCALAR_STMTS (store)
657 : vNULL, last_store))
659 res = false;
660 break;
663 /* Unset the visited flag. */
664 if (store)
665 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
666 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
668 return res;
671 /* Function vect_compute_data_ref_alignment
673 Compute the misalignment of the data reference DR.
675 Output:
676 1. If during the misalignment computation it is found that the data reference
677 cannot be vectorized then false is returned.
678 2. DR_MISALIGNMENT (DR) is defined.
680 FOR NOW: No analysis is actually performed. Misalignment is calculated
681 only for trivial cases. TODO. */
683 bool
684 vect_compute_data_ref_alignment (struct data_reference *dr)
686 gimple *stmt = DR_STMT (dr);
687 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
688 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
689 struct loop *loop = NULL;
690 tree ref = DR_REF (dr);
691 tree vectype;
692 tree base, base_addr;
693 tree misalign = NULL_TREE;
694 tree aligned_to;
695 unsigned HOST_WIDE_INT alignment;
697 if (dump_enabled_p ())
698 dump_printf_loc (MSG_NOTE, vect_location,
699 "vect_compute_data_ref_alignment:\n");
701 if (loop_vinfo)
702 loop = LOOP_VINFO_LOOP (loop_vinfo);
704 /* Initialize misalignment to unknown. */
705 SET_DR_MISALIGNMENT (dr, -1);
707 if (tree_fits_shwi_p (DR_STEP (dr)))
708 misalign = DR_INIT (dr);
709 aligned_to = DR_ALIGNED_TO (dr);
710 base_addr = DR_BASE_ADDRESS (dr);
711 vectype = STMT_VINFO_VECTYPE (stmt_info);
713 /* In case the dataref is in an inner-loop of the loop that is being
714 vectorized (LOOP), we use the base and misalignment information
715 relative to the outer-loop (LOOP). This is ok only if the misalignment
716 stays the same throughout the execution of the inner-loop, which is why
 717      we have to check that the stride of the dataref in the inner-loop is a
 718      multiple of the vector size.  */
719 if (loop && nested_in_vect_loop_p (loop, stmt))
721 tree step = DR_STEP (dr);
723 if (tree_fits_shwi_p (step)
724 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
726 if (dump_enabled_p ())
727 dump_printf_loc (MSG_NOTE, vect_location,
728 "inner step divides the vector-size.\n");
729 misalign = STMT_VINFO_DR_INIT (stmt_info);
730 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
731 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
733 else
735 if (dump_enabled_p ())
736 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
737 "inner step doesn't divide the vector-size.\n");
738 misalign = NULL_TREE;
742 /* Similarly we can only use base and misalignment information relative to
743 an innermost loop if the misalignment stays the same throughout the
 744      execution of the loop.  As above, this is the case if the stride of the
 745      dataref times the vectorization factor is a multiple of the vector size.  */
746 else
748 tree step = DR_STEP (dr);
749 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
751 if (tree_fits_shwi_p (step)
752 && ((tree_to_shwi (step) * vf)
753 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
755 if (dump_enabled_p ())
756 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
757 "step doesn't divide the vector-size.\n");
758 misalign = NULL_TREE;
762 /* To look at alignment of the base we have to preserve an inner MEM_REF
763 as that carries alignment information of the actual access. */
764 base = ref;
765 while (handled_component_p (base))
766 base = TREE_OPERAND (base, 0);
767 if (TREE_CODE (base) == MEM_REF)
768 base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
769 build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
770 unsigned int base_alignment = get_object_alignment (base);
772 if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
773 DR_VECT_AUX (dr)->base_element_aligned = true;
775 alignment = TYPE_ALIGN_UNIT (vectype);
777 if ((compare_tree_int (aligned_to, alignment) < 0)
778 || !misalign)
780 if (dump_enabled_p ())
782 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
783 "Unknown alignment for access: ");
784 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
785 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
787 return true;
790 if (base_alignment < TYPE_ALIGN (vectype))
792 /* Strip an inner MEM_REF to a bare decl if possible. */
793 if (TREE_CODE (base) == MEM_REF
794 && integer_zerop (TREE_OPERAND (base, 1))
795 && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
796 base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
798 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
800 if (dump_enabled_p ())
802 dump_printf_loc (MSG_NOTE, vect_location,
803 "can't force alignment of ref: ");
804 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
805 dump_printf (MSG_NOTE, "\n");
807 return true;
810 /* Force the alignment of the decl.
811 NOTE: This is the only change to the code we make during
812 the analysis phase, before deciding to vectorize the loop. */
813 if (dump_enabled_p ())
815 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
816 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
817 dump_printf (MSG_NOTE, "\n");
820 DR_VECT_AUX (dr)->base_decl = base;
821 DR_VECT_AUX (dr)->base_misaligned = true;
822 DR_VECT_AUX (dr)->base_element_aligned = true;
 825   /* If this is a backward running DR then the first access in the larger
 826      vectype actually is N-1 elements before the address in the DR.
827 Adjust misalign accordingly. */
828 if (tree_int_cst_sgn (DR_STEP (dr)) < 0)
830 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
831 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
832 otherwise we wouldn't be here. */
833 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
834 /* PLUS because DR_STEP was negative. */
835 misalign = size_binop (PLUS_EXPR, misalign, offset);
838 SET_DR_MISALIGNMENT (dr,
839 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
841 if (dump_enabled_p ())
843 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
844 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
845 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
846 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
849 return true;
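/* As an illustration, assuming a 16-byte V4SI vector type and a 16-byte
   aligned "int a[N]", the access a[i + 3] has a 16-byte aligned base and
   a DR_INIT of 12, so DR_MISALIGNMENT becomes 12.  For a backward access
   (negative step) the first element touched by a vector load is
   (nelements - 1) * |step| bytes lower, which the adjustment above folds
   into the misalignment before the final modulo.  */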
853 /* Function vect_update_misalignment_for_peel
855 DR - the data reference whose misalignment is to be adjusted.
856 DR_PEEL - the data reference whose misalignment is being made
857 zero in the vector loop by the peel.
858 NPEEL - the number of iterations in the peel loop if the misalignment
859 of DR_PEEL is known at compile time. */
861 static void
862 vect_update_misalignment_for_peel (struct data_reference *dr,
863 struct data_reference *dr_peel, int npeel)
865 unsigned int i;
866 vec<dr_p> same_align_drs;
867 struct data_reference *current_dr;
868 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
869 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
870 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
871 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
873 /* For interleaved data accesses the step in the loop must be multiplied by
874 the size of the interleaving group. */
875 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
876 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
877 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
878 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
880 /* It can be assumed that the data refs with the same alignment as dr_peel
881 are aligned in the vector loop. */
882 same_align_drs
883 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
884 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
886 if (current_dr != dr)
887 continue;
888 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
889 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
890 SET_DR_MISALIGNMENT (dr, 0);
891 return;
894 if (known_alignment_for_access_p (dr)
895 && known_alignment_for_access_p (dr_peel))
897 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
898 int misal = DR_MISALIGNMENT (dr);
899 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
900 misal += negative ? -npeel * dr_size : npeel * dr_size;
901 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
902 SET_DR_MISALIGNMENT (dr, misal);
903 return;
906 if (dump_enabled_p ())
907 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
908 SET_DR_MISALIGNMENT (dr, -1);
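/* For example, with a 16-byte vector type and 4-byte elements, a data-ref
   whose misalignment is 8 that is advanced by an NPEEL of 3 iterations
   moves by 3 * 4 = 12 bytes, giving (8 + 12) & 15 = 4 as the new
   misalignment; when either misalignment is unknown the code above
   conservatively resets it to -1.  */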
912 /* Function verify_data_ref_alignment
914 Return TRUE if DR can be handled with respect to alignment. */
916 static bool
917 verify_data_ref_alignment (data_reference_p dr)
919 enum dr_alignment_support supportable_dr_alignment
920 = vect_supportable_dr_alignment (dr, false);
921 if (!supportable_dr_alignment)
923 if (dump_enabled_p ())
925 if (DR_IS_READ (dr))
926 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
927 "not vectorized: unsupported unaligned load.");
928 else
929 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
930 "not vectorized: unsupported unaligned "
931 "store.");
933 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
934 DR_REF (dr));
935 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
937 return false;
940 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
941 dump_printf_loc (MSG_NOTE, vect_location,
942 "Vectorizing an unaligned access.\n");
944 return true;
947 /* Function vect_verify_datarefs_alignment
949 Return TRUE if all data references in the loop can be
950 handled with respect to alignment. */
952 bool
953 vect_verify_datarefs_alignment (loop_vec_info vinfo)
955 vec<data_reference_p> datarefs = vinfo->datarefs;
956 struct data_reference *dr;
957 unsigned int i;
959 FOR_EACH_VEC_ELT (datarefs, i, dr)
961 gimple *stmt = DR_STMT (dr);
962 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
964 if (!STMT_VINFO_RELEVANT_P (stmt_info))
965 continue;
967 /* For interleaving, only the alignment of the first access matters. */
968 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
969 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
970 continue;
972 /* Strided accesses perform only component accesses, alignment is
973 irrelevant for them. */
974 if (STMT_VINFO_STRIDED_P (stmt_info)
975 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
976 continue;
978 if (! verify_data_ref_alignment (dr))
979 return false;
982 return true;
 985 /* Given a memory reference EXP, return whether its alignment is less
 986    than its size.  */
988 static bool
989 not_size_aligned (tree exp)
991 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
992 return true;
994 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
995 > get_object_alignment (exp));
998 /* Function vector_alignment_reachable_p
1000 Return true if vector alignment for DR is reachable by peeling
1001 a few loop iterations. Return false otherwise. */
1003 static bool
1004 vector_alignment_reachable_p (struct data_reference *dr)
1006 gimple *stmt = DR_STMT (dr);
1007 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1008 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1010 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1012       /* For interleaved accesses we peel only if the number of iterations
1013          in the prolog loop (VF - misalignment) is a multiple of the
1014          number of interleaved accesses.  */
1015 int elem_size, mis_in_elements;
1016 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1018 /* FORNOW: handle only known alignment. */
1019 if (!known_alignment_for_access_p (dr))
1020 return false;
1022 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1023 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1025 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1026 return false;
1029   /* If the misalignment is known at compile time then allow peeling
1030      only if natural alignment is reachable through peeling.  */
1031 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1033 HOST_WIDE_INT elmsize =
1034 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1035 if (dump_enabled_p ())
1037 dump_printf_loc (MSG_NOTE, vect_location,
1038 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1039 dump_printf (MSG_NOTE,
1040 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1042 if (DR_MISALIGNMENT (dr) % elmsize)
1044 if (dump_enabled_p ())
1045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1046 "data size does not divide the misalignment.\n");
1047 return false;
1051 if (!known_alignment_for_access_p (dr))
1053 tree type = TREE_TYPE (DR_REF (dr));
1054 bool is_packed = not_size_aligned (DR_REF (dr));
1055 if (dump_enabled_p ())
1056 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1057 "Unknown misalignment, is_packed = %d\n",is_packed);
1058 if ((TYPE_USER_ALIGN (type) && !is_packed)
1059 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1060 return true;
1061 else
1062 return false;
1065 return true;
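/* To illustrate, for V4SI vectors, a group of two interleaved 4-byte
   accesses and a known misalignment of 8 bytes: 8 / 4 = 2 elements are
   misaligned, 4 - 2 = 2 elements would be peeled, and 2 is a multiple of
   the group size, so alignment is considered reachable by peeling.  */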
1069 /* Calculate the cost of the memory access represented by DR. */
1071 static void
1072 vect_get_data_access_cost (struct data_reference *dr,
1073 unsigned int *inside_cost,
1074 unsigned int *outside_cost,
1075 stmt_vector_for_cost *body_cost_vec)
1077 gimple *stmt = DR_STMT (dr);
1078 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1079 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1080 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1081 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1082 int ncopies = vf / nunits;
1084 if (DR_IS_READ (dr))
1085 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1086 NULL, body_cost_vec, false);
1087 else
1088 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1090 if (dump_enabled_p ())
1091 dump_printf_loc (MSG_NOTE, vect_location,
1092 "vect_get_data_access_cost: inside_cost = %d, "
1093 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1097 typedef struct _vect_peel_info
1099 int npeel;
1100 struct data_reference *dr;
1101 unsigned int count;
1102 } *vect_peel_info;
1104 typedef struct _vect_peel_extended_info
1106 struct _vect_peel_info peel_info;
1107 unsigned int inside_cost;
1108 unsigned int outside_cost;
1109 stmt_vector_for_cost body_cost_vec;
1110 } *vect_peel_extended_info;
1113 /* Peeling hashtable helpers. */
1115 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1117 static inline hashval_t hash (const _vect_peel_info *);
1118 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1121 inline hashval_t
1122 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1124 return (hashval_t) peel_info->npeel;
1127 inline bool
1128 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1130 return (a->npeel == b->npeel);
1134 /* Insert DR into peeling hash table with NPEEL as key. */
1136 static void
1137 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1138 loop_vec_info loop_vinfo, struct data_reference *dr,
1139 int npeel)
1141 struct _vect_peel_info elem, *slot;
1142 _vect_peel_info **new_slot;
1143 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1145 elem.npeel = npeel;
1146 slot = peeling_htab->find (&elem);
1147 if (slot)
1148 slot->count++;
1149 else
1151 slot = XNEW (struct _vect_peel_info);
1152 slot->npeel = npeel;
1153 slot->dr = dr;
1154 slot->count = 1;
1155 new_slot = peeling_htab->find_slot (slot, INSERT);
1156 *new_slot = slot;
1159 if (!supportable_dr_alignment
1160 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1161 slot->count += VECT_MAX_COST;
1165 /* Traverse the peeling hash table to find the peeling option that aligns
1166    the maximum number of data accesses.  */
1169 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1170 _vect_peel_extended_info *max)
1172 vect_peel_info elem = *slot;
1174 if (elem->count > max->peel_info.count
1175 || (elem->count == max->peel_info.count
1176 && max->peel_info.npeel > elem->npeel))
1178 max->peel_info.npeel = elem->npeel;
1179 max->peel_info.count = elem->count;
1180 max->peel_info.dr = elem->dr;
1183 return 1;
1187 /* Traverse the peeling hash table and calculate the cost for each peeling
1188    option.  Find the one with the lowest cost.  */
1191 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1192 _vect_peel_extended_info *min)
1194 vect_peel_info elem = *slot;
1195 int save_misalignment, dummy;
1196 unsigned int inside_cost = 0, outside_cost = 0, i;
1197 gimple *stmt = DR_STMT (elem->dr);
1198 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1199 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1200 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1201 struct data_reference *dr;
1202 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1204 prologue_cost_vec.create (2);
1205 body_cost_vec.create (2);
1206 epilogue_cost_vec.create (2);
1208 FOR_EACH_VEC_ELT (datarefs, i, dr)
1210 stmt = DR_STMT (dr);
1211 stmt_info = vinfo_for_stmt (stmt);
1212 /* For interleaving, only the alignment of the first access
1213 matters. */
1214 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1215 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1216 continue;
1218 /* Strided accesses perform only component accesses, alignment is
1219 irrelevant for them. */
1220 if (STMT_VINFO_STRIDED_P (stmt_info)
1221 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1222 continue;
1224 save_misalignment = DR_MISALIGNMENT (dr);
1225 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1226 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1227 &body_cost_vec);
1228 SET_DR_MISALIGNMENT (dr, save_misalignment);
1231 outside_cost += vect_get_known_peeling_cost
1232 (loop_vinfo, elem->npeel, &dummy,
1233 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1234 &prologue_cost_vec, &epilogue_cost_vec);
1236 /* Prologue and epilogue costs are added to the target model later.
1237 These costs depend only on the scalar iteration cost, the
1238 number of peeling iterations finally chosen, and the number of
1239 misaligned statements. So discard the information found here. */
1240 prologue_cost_vec.release ();
1241 epilogue_cost_vec.release ();
1243 if (inside_cost < min->inside_cost
1244 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1246 min->inside_cost = inside_cost;
1247 min->outside_cost = outside_cost;
1248 min->body_cost_vec.release ();
1249 min->body_cost_vec = body_cost_vec;
1250 min->peel_info.dr = elem->dr;
1251 min->peel_info.npeel = elem->npeel;
1253 else
1254 body_cost_vec.release ();
1256 return 1;
1260 /* Choose best peeling option by traversing peeling hash table and either
1261 choosing an option with the lowest cost (if cost model is enabled) or the
1262 option that aligns as many accesses as possible. */
1264 static struct data_reference *
1265 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1266 loop_vec_info loop_vinfo,
1267 unsigned int *npeel,
1268 stmt_vector_for_cost *body_cost_vec)
1270 struct _vect_peel_extended_info res;
1272 res.peel_info.dr = NULL;
1273 res.body_cost_vec = stmt_vector_for_cost ();
1275 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1277 res.inside_cost = INT_MAX;
1278 res.outside_cost = INT_MAX;
1279 peeling_htab->traverse <_vect_peel_extended_info *,
1280 vect_peeling_hash_get_lowest_cost> (&res);
1282 else
1284 res.peel_info.count = 0;
1285 peeling_htab->traverse <_vect_peel_extended_info *,
1286 vect_peeling_hash_get_most_frequent> (&res);
1289 *npeel = res.peel_info.npeel;
1290 *body_cost_vec = res.body_cost_vec;
1291 return res.peel_info.dr;
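/* As an example, if peeling 2 iterations would align three data
   references while peeling 6 would align only one, the "most frequent"
   strategy above selects npeel = 2; with the cost model enabled, the
   candidate with the lowest combined inside/outside cost wins instead.  */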
1295 /* Function vect_enhance_data_refs_alignment
1297 This pass will use loop versioning and loop peeling in order to enhance
1298 the alignment of data references in the loop.
1300 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1301 original loop is to be vectorized. Any other loops that are created by
1302    the transformations performed in this pass are not supposed to be
1303 vectorized. This restriction will be relaxed.
1305 This pass will require a cost model to guide it whether to apply peeling
1306 or versioning or a combination of the two. For example, the scheme that
1307    Intel uses when given a loop with several memory accesses, is as follows:
1308    choose one memory access ('p') whose alignment you want to force by doing
1309 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1310 other accesses are not necessarily aligned, or (2) use loop versioning to
1311 generate one loop in which all accesses are aligned, and another loop in
1312 which only 'p' is necessarily aligned.
1314 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1315    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1316 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1318 Devising a cost model is the most critical aspect of this work. It will
1319 guide us on which access to peel for, whether to use loop versioning, how
1320 many versions to create, etc. The cost model will probably consist of
1321 generic considerations as well as target specific considerations (on
1322 powerpc for example, misaligned stores are more painful than misaligned
1323 loads).
1325 Here are the general steps involved in alignment enhancements:
1327 -- original loop, before alignment analysis:
1328 for (i=0; i<N; i++){
1329 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1330 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1333 -- After vect_compute_data_refs_alignment:
1334 for (i=0; i<N; i++){
1335 x = q[i]; # DR_MISALIGNMENT(q) = 3
1336 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1339 -- Possibility 1: we do loop versioning:
1340 if (p is aligned) {
1341 for (i=0; i<N; i++){ # loop 1A
1342 x = q[i]; # DR_MISALIGNMENT(q) = 3
1343 p[i] = y; # DR_MISALIGNMENT(p) = 0
1346 else {
1347 for (i=0; i<N; i++){ # loop 1B
1348 x = q[i]; # DR_MISALIGNMENT(q) = 3
1349 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1353 -- Possibility 2: we do loop peeling:
1354 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1355 x = q[i];
1356 p[i] = y;
1358 for (i = 3; i < N; i++){ # loop 2A
1359 x = q[i]; # DR_MISALIGNMENT(q) = 0
1360 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1363 -- Possibility 3: combination of loop peeling and versioning:
1364 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1365 x = q[i];
1366 p[i] = y;
1368 if (p is aligned) {
1369 for (i = 3; i<N; i++){ # loop 3A
1370 x = q[i]; # DR_MISALIGNMENT(q) = 0
1371 p[i] = y; # DR_MISALIGNMENT(p) = 0
1374 else {
1375 for (i = 3; i<N; i++){ # loop 3B
1376 x = q[i]; # DR_MISALIGNMENT(q) = 0
1377 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1381 These loops are later passed to loop_transform to be vectorized. The
1382 vectorizer will use the alignment information to guide the transformation
1383 (whether to generate regular loads/stores, or with special handling for
1384 misalignment). */
1386 bool
1387 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1389 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1390 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1391 enum dr_alignment_support supportable_dr_alignment;
1392 struct data_reference *dr0 = NULL, *first_store = NULL;
1393 struct data_reference *dr;
1394 unsigned int i, j;
1395 bool do_peeling = false;
1396 bool do_versioning = false;
1397 bool stat;
1398 gimple *stmt;
1399 stmt_vec_info stmt_info;
1400 unsigned int npeel = 0;
1401 bool all_misalignments_unknown = true;
1402 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1403 unsigned possible_npeel_number = 1;
1404 tree vectype;
1405 unsigned int nelements, mis, same_align_drs_max = 0;
1406 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1407 hash_table<peel_info_hasher> peeling_htab (1);
1409 if (dump_enabled_p ())
1410 dump_printf_loc (MSG_NOTE, vect_location,
1411 "=== vect_enhance_data_refs_alignment ===\n");
1413 /* Reset data so we can safely be called multiple times. */
1414 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1415 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1417 /* While cost model enhancements are expected in the future, the high level
1418 view of the code at this time is as follows:
1420 A) If there is a misaligned access then see if peeling to align
1421 this access can make all data references satisfy
1422 vect_supportable_dr_alignment. If so, update data structures
1423 as needed and return true.
1425 B) If peeling wasn't possible and there is a data reference with an
1426 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1427 then see if loop versioning checks can be used to make all data
1428 references satisfy vect_supportable_dr_alignment. If so, update
1429 data structures as needed and return true.
1431 C) If neither peeling nor versioning were successful then return false if
1432 any data reference does not satisfy vect_supportable_dr_alignment.
1434 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1436 Note, Possibility 3 above (which is peeling and versioning together) is not
1437 being done at this time. */
1439 /* (1) Peeling to force alignment. */
1441 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1442 Considerations:
1443 + How many accesses will become aligned due to the peeling
1444 - How many accesses will become unaligned due to the peeling,
1445 and the cost of misaligned accesses.
1446 - The cost of peeling (the extra runtime checks, the increase
1447 in code size). */
1449 FOR_EACH_VEC_ELT (datarefs, i, dr)
1451 stmt = DR_STMT (dr);
1452 stmt_info = vinfo_for_stmt (stmt);
1454 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1455 continue;
1457 /* For interleaving, only the alignment of the first access
1458 matters. */
1459 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1460 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1461 continue;
1463 /* For invariant accesses there is nothing to enhance. */
1464 if (integer_zerop (DR_STEP (dr)))
1465 continue;
1467 /* Strided accesses perform only component accesses, alignment is
1468 irrelevant for them. */
1469 if (STMT_VINFO_STRIDED_P (stmt_info)
1470 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1471 continue;
1473 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1474 do_peeling = vector_alignment_reachable_p (dr);
1475 if (do_peeling)
1477 if (known_alignment_for_access_p (dr))
1479 unsigned int npeel_tmp;
1480 bool negative = tree_int_cst_compare (DR_STEP (dr),
1481 size_zero_node) < 0;
1483 /* Save info about DR in the hash table. */
1484 vectype = STMT_VINFO_VECTYPE (stmt_info);
1485 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1486 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1487 TREE_TYPE (DR_REF (dr))));
1488 npeel_tmp = (negative
1489 ? (mis - nelements) : (nelements - mis))
1490 & (nelements - 1);
1492 /* For multiple types, it is possible that the bigger type access
1493 will have more than one peeling option. E.g., a loop with two
1494 types: one of size (vector size / 4), and the other one of
1495             types: one of size (vector size / 4), and the other one of
1495             size (vector size / 8).  The vectorization factor will be 8.  If both
1496             accesses are misaligned by 3, the first one needs one scalar
1497 iteration to be aligned, and the second one needs 5. But the
1498 first one will be aligned also by peeling 5 scalar
1499 iterations, and in that case both accesses will be aligned.
1500 Hence, except for the immediate peeling amount, we also want
1501 to try to add full vector size, while we don't exceed
1502 vectorization factor.
1503             We do this automatically for the cost model, since we calculate the cost
1504 for every peeling option. */
1505 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1507 if (STMT_SLP_TYPE (stmt_info))
1508 possible_npeel_number
1509 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1510 else
1511 possible_npeel_number = vf / nelements;
1514 /* Handle the aligned case. We may decide to align some other
1515 access, making DR unaligned. */
1516 if (DR_MISALIGNMENT (dr) == 0)
1518 npeel_tmp = 0;
1519 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1520 possible_npeel_number++;
1523 for (j = 0; j < possible_npeel_number; j++)
1525 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1526 dr, npeel_tmp);
1527 npeel_tmp += nelements;
1530 all_misalignments_unknown = false;
1531 /* Data-ref that was chosen for the case that all the
1532 misalignments are unknown is not relevant anymore, since we
1533 have a data-ref with known alignment. */
1534 dr0 = NULL;
1536 else
1538           /* If we don't know any misalignment values, we prefer
1539              peeling for the data-ref that has the maximum number of data-refs
1540              with the same alignment, unless the target prefers to align
1541              stores over loads.  */
1542 if (all_misalignments_unknown)
1544 unsigned same_align_drs
1545 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1546 if (!dr0
1547 || same_align_drs_max < same_align_drs)
1549 same_align_drs_max = same_align_drs;
1550 dr0 = dr;
1552 /* For data-refs with the same number of related
1553 accesses prefer the one where the misalign
1554 computation will be invariant in the outermost loop. */
1555 else if (same_align_drs_max == same_align_drs)
1557 struct loop *ivloop0, *ivloop;
1558 ivloop0 = outermost_invariant_loop_for_expr
1559 (loop, DR_BASE_ADDRESS (dr0));
1560 ivloop = outermost_invariant_loop_for_expr
1561 (loop, DR_BASE_ADDRESS (dr));
1562 if ((ivloop && !ivloop0)
1563 || (ivloop && ivloop0
1564 && flow_loop_nested_p (ivloop, ivloop0)))
1565 dr0 = dr;
1568 if (!first_store && DR_IS_WRITE (dr))
1569 first_store = dr;
1572 /* If there are both known and unknown misaligned accesses in the
1573 loop, we choose peeling amount according to the known
1574 accesses. */
1575 if (!supportable_dr_alignment)
1577 dr0 = dr;
1578 if (!first_store && DR_IS_WRITE (dr))
1579 first_store = dr;
1583 else
1585 if (!aligned_access_p (dr))
1587 if (dump_enabled_p ())
1588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1589 "vector alignment may not be reachable\n");
1590 break;
1595 /* Check if we can possibly peel the loop. */
1596 if (!vect_can_advance_ivs_p (loop_vinfo)
1597 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1598 || loop->inner)
1599 do_peeling = false;
1601 if (do_peeling
1602 && all_misalignments_unknown
1603 && vect_supportable_dr_alignment (dr0, false))
1605       /* Check if the target prefers to align stores over loads, i.e., if
1606 misaligned stores are more expensive than misaligned loads (taking
1607 drs with same alignment into account). */
1608 if (first_store && DR_IS_READ (dr0))
1610 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1611 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1612 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1613 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1614 stmt_vector_for_cost dummy;
1615 dummy.create (2);
1617 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1618 &dummy);
1619 vect_get_data_access_cost (first_store, &store_inside_cost,
1620 &store_outside_cost, &dummy);
1622 dummy.release ();
1624 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1625 aligning the load DR0). */
1626 load_inside_penalty = store_inside_cost;
1627 load_outside_penalty = store_outside_cost;
1628 for (i = 0;
1629 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1630 DR_STMT (first_store))).iterate (i, &dr);
1631 i++)
1632 if (DR_IS_READ (dr))
1634 load_inside_penalty += load_inside_cost;
1635 load_outside_penalty += load_outside_cost;
1637 else
1639 load_inside_penalty += store_inside_cost;
1640 load_outside_penalty += store_outside_cost;
1643 /* Calculate the penalty for leaving DR0 unaligned (by
1644 aligning the FIRST_STORE). */
1645 store_inside_penalty = load_inside_cost;
1646 store_outside_penalty = load_outside_cost;
1647 for (i = 0;
1648 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1649 DR_STMT (dr0))).iterate (i, &dr);
1650 i++)
1651 if (DR_IS_READ (dr))
1653 store_inside_penalty += load_inside_cost;
1654 store_outside_penalty += load_outside_cost;
1656 else
1658 store_inside_penalty += store_inside_cost;
1659 store_outside_penalty += store_outside_cost;
1662 if (load_inside_penalty > store_inside_penalty
1663 || (load_inside_penalty == store_inside_penalty
1664 && load_outside_penalty > store_outside_penalty))
1665 dr0 = first_store;
1668 /* In case there are only loads with different unknown misalignments, use
1669 peeling only if it may help to align other accesses in the loop or
1670          if it may help improve load bandwidth when we'd end up using
1671 unaligned loads. */
1672 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1673 if (!first_store
1674 && !STMT_VINFO_SAME_ALIGN_REFS (
1675 vinfo_for_stmt (DR_STMT (dr0))).length ()
1676 && (vect_supportable_dr_alignment (dr0, false)
1677 != dr_unaligned_supported
1678 || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1679 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1680 do_peeling = false;
1683 if (do_peeling && !dr0)
1685 /* Peeling is possible, but there is no data access that is not supported
1686 unless aligned. So we try to choose the best possible peeling. */
1688 /* We should get here only if there are drs with known misalignment. */
1689 gcc_assert (!all_misalignments_unknown);
1691 /* Choose the best peeling from the hash table. */
1692 dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
1693 loop_vinfo, &npeel,
1694 &body_cost_vec);
1695 if (!dr0 || !npeel)
1696 do_peeling = false;
1699 if (do_peeling)
1701 stmt = DR_STMT (dr0);
1702 stmt_info = vinfo_for_stmt (stmt);
1703 vectype = STMT_VINFO_VECTYPE (stmt_info);
1704 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1706 if (known_alignment_for_access_p (dr0))
1708 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1709 size_zero_node) < 0;
1710 if (!npeel)
1712 /* Since it's known at compile time, compute the number of
1713 iterations in the peeled loop (the peeling factor) for use in
1714 updating DR_MISALIGNMENT values. The peeling factor is the
1715 vectorization factor minus the misalignment as an element
1716 count. */
1717 mis = DR_MISALIGNMENT (dr0);
1718 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1719 npeel = ((negative ? mis - nelements : nelements - mis)
1720 & (nelements - 1));
1723 /* For interleaved data access every iteration accesses all the
1724 members of the group, therefore we divide the number of iterations
1725 by the group size. */
1726 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1727 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1728 npeel /= GROUP_SIZE (stmt_info);
1730 if (dump_enabled_p ())
1731 dump_printf_loc (MSG_NOTE, vect_location,
1732 "Try peeling by %d\n", npeel);
1735 /* Ensure that all data refs can be vectorized after the peel. */
1736 FOR_EACH_VEC_ELT (datarefs, i, dr)
1738 int save_misalignment;
1740 if (dr == dr0)
1741 continue;
1743 stmt = DR_STMT (dr);
1744 stmt_info = vinfo_for_stmt (stmt);
1745 /* For interleaving, only the alignment of the first access
1746 matters. */
1747 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1748 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1749 continue;
1751 /* Strided accesses perform only component accesses, alignment is
1752 irrelevant for them. */
1753 if (STMT_VINFO_STRIDED_P (stmt_info)
1754 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1755 continue;
1757 save_misalignment = DR_MISALIGNMENT (dr);
1758 vect_update_misalignment_for_peel (dr, dr0, npeel);
1759 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1760 SET_DR_MISALIGNMENT (dr, save_misalignment);
1762 if (!supportable_dr_alignment)
1764 do_peeling = false;
1765 break;
1769 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1771 stat = vect_verify_datarefs_alignment (loop_vinfo);
1772 if (!stat)
1773 do_peeling = false;
1774 else
1776 body_cost_vec.release ();
1777 return stat;
1781 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1782 if (do_peeling)
1784 unsigned max_allowed_peel
1785 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1786 if (max_allowed_peel != (unsigned)-1)
1788 unsigned max_peel = npeel;
1789 if (max_peel == 0)
1791 gimple *dr_stmt = DR_STMT (dr0);
1792 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1793 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1794 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1796 if (max_peel > max_allowed_peel)
1798 do_peeling = false;
1799 if (dump_enabled_p ())
1800 dump_printf_loc (MSG_NOTE, vect_location,
1801 "Disable peeling, max peels reached: %d\n", max_peel);
1806 /* Cost model #2 - if peeling may result in a remaining loop not
1807 iterating enough to be vectorized then do not peel. */
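/* For instance, with a vectorization factor of 4 and NPEEL == 0 (so up to
   VF - 1 = 3 iterations may be peeled), at least 4 + 3 = 7 known scalar
   iterations are required to keep peeling.  */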
1808 if (do_peeling
1809 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1811 unsigned max_peel
1812 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1813 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1814 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1815 do_peeling = false;
1818 if (do_peeling)
1820 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1821 If the misalignment of DR_i is identical to that of dr0 then set
1822 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1823 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1824 by the peeling factor times the element size of DR_i (MOD the
1825 vectorization factor times the size). Otherwise, the
1826 misalignment of DR_i must be set to unknown. */
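/* Illustrative numbers: a DR_i whose misalignment (4 bytes, with 4-byte
   elements) differs from that of DR0, peeled by NPEEL == 2 with a
   vectorization factor of 4, ends up with misalignment
   (4 + 2 * 4) % (4 * 4) = 12.  */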
1827 FOR_EACH_VEC_ELT (datarefs, i, dr)
1828 if (dr != dr0)
1830 /* Strided accesses perform only component accesses, alignment
1831 is irrelevant for them. */
1832 stmt_info = vinfo_for_stmt (DR_STMT (dr));
1833 if (STMT_VINFO_STRIDED_P (stmt_info)
1834 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1835 continue;
1837 vect_update_misalignment_for_peel (dr, dr0, npeel);
1840 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1841 if (npeel)
1842 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1843 else
1844 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1845 = DR_MISALIGNMENT (dr0);
1846 SET_DR_MISALIGNMENT (dr0, 0);
1847 if (dump_enabled_p ())
1849 dump_printf_loc (MSG_NOTE, vect_location,
1850 "Alignment of access forced using peeling.\n");
1851 dump_printf_loc (MSG_NOTE, vect_location,
1852 "Peeling for alignment will be applied.\n");
1854 /* The inside-loop cost will be accounted for in vectorizable_load
1855 and vectorizable_store correctly with adjusted alignments.
1856 Drop the body_cost_vec on the floor here. */
1857 body_cost_vec.release ();
1859 stat = vect_verify_datarefs_alignment (loop_vinfo);
1860 gcc_assert (stat);
1861 return stat;
1865 body_cost_vec.release ();
1867 /* (2) Versioning to force alignment. */
1869 /* Try versioning if:
1870 1) optimize loop for speed
1871 2) there is at least one unsupported misaligned data ref with an unknown
1872 misalignment, and
1873 3) all misaligned data refs with a known misalignment are supported, and
1874 4) the number of runtime alignment checks is within reason. */
1876 do_versioning =
1877 optimize_loop_nest_for_speed_p (loop)
1878 && (!loop->inner); /* FORNOW */
1880 if (do_versioning)
1882 FOR_EACH_VEC_ELT (datarefs, i, dr)
1884 stmt = DR_STMT (dr);
1885 stmt_info = vinfo_for_stmt (stmt);
1887 /* For interleaving, only the alignment of the first access
1888 matters. */
1889 if (aligned_access_p (dr)
1890 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1891 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1892 continue;
1894 if (STMT_VINFO_STRIDED_P (stmt_info))
1896 /* Strided loads perform only component accesses, alignment is
1897 irrelevant for them. */
1898 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1899 continue;
1900 do_versioning = false;
1901 break;
1904 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1906 if (!supportable_dr_alignment)
1908 gimple *stmt;
1909 int mask;
1910 tree vectype;
1912 if (known_alignment_for_access_p (dr)
1913 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1914 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1916 do_versioning = false;
1917 break;
1920 stmt = DR_STMT (dr);
1921 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1922 gcc_assert (vectype);
1924 /* The rightmost bits of an aligned address must be zeros.
1925 Construct the mask needed for this test. For example,
1926 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1927 mask must be 15 = 0xf. */
1928 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
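/* E.g. for V4SI (16-byte vectors) MASK is 15 and the versioning condition
   built later effectively requires (addr & 15) == 0 for each potentially
   misaligned address.  */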
1930 /* FORNOW: use the same mask to test all potentially unaligned
1931 references in the loop. The vectorizer currently supports
1932 a single vector size, see the reference to
1933 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1934 vectorization factor is computed. */
1935 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1936 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1937 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1938 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1939 DR_STMT (dr));
1943 /* Versioning requires at least one misaligned data reference. */
1944 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1945 do_versioning = false;
1946 else if (!do_versioning)
1947 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1950 if (do_versioning)
1952 vec<gimple *> may_misalign_stmts
1953 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1954 gimple *stmt;
1956 /* It can now be assumed that the data references in the statements
1957 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1958 of the loop being vectorized. */
1959 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1961 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1962 dr = STMT_VINFO_DATA_REF (stmt_info);
1963 SET_DR_MISALIGNMENT (dr, 0);
1964 if (dump_enabled_p ())
1965 dump_printf_loc (MSG_NOTE, vect_location,
1966 "Alignment of access forced using versioning.\n");
1969 if (dump_enabled_p ())
1970 dump_printf_loc (MSG_NOTE, vect_location,
1971 "Versioning for alignment will be applied.\n");
1973 /* Peeling and versioning can't be done together at this time. */
1974 gcc_assert (! (do_peeling && do_versioning));
1976 stat = vect_verify_datarefs_alignment (loop_vinfo);
1977 gcc_assert (stat);
1978 return stat;
1981 /* This point is reached if neither peeling nor versioning is being done. */
1982 gcc_assert (! (do_peeling || do_versioning));
1984 stat = vect_verify_datarefs_alignment (loop_vinfo);
1985 return stat;
1989 /* Function vect_find_same_alignment_drs.
1991 Update group and alignment relations according to the chosen
1992 vectorization factor. */
1994 static void
1995 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1996 loop_vec_info loop_vinfo)
1998 unsigned int i;
1999 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2000 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2001 struct data_reference *dra = DDR_A (ddr);
2002 struct data_reference *drb = DDR_B (ddr);
2003 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2004 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2005 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
2006 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
2007 lambda_vector dist_v;
2008 unsigned int loop_depth;
2010 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2011 return;
2013 if (dra == drb)
2014 return;
2016 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
2017 return;
2019 /* Loop-based vectorization and known data dependence. */
2020 if (DDR_NUM_DIST_VECTS (ddr) == 0)
2021 return;
2023 /* Data-dependence analysis reports a distance vector of zero
2024 for data-references that overlap only in the first iteration
2025 but have steps of different sign (see PR45764).
2026 So as a sanity check require equal DR_STEP. */
2027 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2028 return;
2030 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
2031 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
2033 int dist = dist_v[loop_depth];
2035 if (dump_enabled_p ())
2036 dump_printf_loc (MSG_NOTE, vect_location,
2037 "dependence distance = %d.\n", dist);
2039 /* Same loop iteration. */
2040 if (dist == 0
2041 || (dist % vectorization_factor == 0 && dra_size == drb_size))
2043 /* Two references with distance zero have the same alignment. */
2044 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2045 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2046 if (dump_enabled_p ())
2048 dump_printf_loc (MSG_NOTE, vect_location,
2049 "accesses have the same alignment.\n");
2050 dump_printf (MSG_NOTE,
2051 "dependence distance modulo vf == 0 between ");
2052 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2053 dump_printf (MSG_NOTE, " and ");
2054 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2055 dump_printf (MSG_NOTE, "\n");
2062 /* Function vect_analyze_data_refs_alignment
2064 Analyze the alignment of the data-references in the loop.
2065 Return FALSE if a data reference is found that cannot be vectorized. */
2067 bool
2068 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2070 if (dump_enabled_p ())
2071 dump_printf_loc (MSG_NOTE, vect_location,
2072 "=== vect_analyze_data_refs_alignment ===\n");
2074 /* Mark groups of data references with same alignment using
2075 data dependence information. */
2076 vec<ddr_p> ddrs = vinfo->ddrs;
2077 struct data_dependence_relation *ddr;
2078 unsigned int i;
2080 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2081 vect_find_same_alignment_drs (ddr, vinfo);
2083 vec<data_reference_p> datarefs = vinfo->datarefs;
2084 struct data_reference *dr;
2086 FOR_EACH_VEC_ELT (datarefs, i, dr)
2088 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2089 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2090 && !vect_compute_data_ref_alignment (dr))
2092 /* Strided accesses perform only component accesses, misalignment
2093 information is irrelevant for them. */
2094 if (STMT_VINFO_STRIDED_P (stmt_info)
2095 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2096 continue;
2098 if (dump_enabled_p ())
2099 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2100 "not vectorized: can't calculate alignment "
2101 "for data ref.\n");
2103 return false;
2107 return true;
2111 /* Analyze alignment of DRs of stmts in NODE. */
2113 static bool
2114 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2116 /* We vectorize from the first scalar stmt in the node unless
2117 the node is permuted in which case we start from the first
2118 element in the group. */
2119 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2120 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2121 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2122 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2124 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2125 if (! vect_compute_data_ref_alignment (dr)
2126 /* For creating the data-ref pointer we need alignment of the
2127 first element anyway. */
2128 || (dr != first_dr
2129 && ! vect_compute_data_ref_alignment (first_dr))
2130 || ! verify_data_ref_alignment (dr))
2132 if (dump_enabled_p ())
2133 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2134 "not vectorized: bad data alignment in basic "
2135 "block.\n");
2136 return false;
2139 return true;
2142 /* Function vect_slp_analyze_instance_alignment
2144 Analyze the alignment of the data-references in the SLP instance.
2145 Return FALSE if a data reference is found that cannot be vectorized. */
2147 bool
2148 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2150 if (dump_enabled_p ())
2151 dump_printf_loc (MSG_NOTE, vect_location,
2152 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2154 slp_tree node;
2155 unsigned i;
2156 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2157 if (! vect_slp_analyze_and_verify_node_alignment (node))
2158 return false;
2160 node = SLP_INSTANCE_TREE (instance);
2161 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2162 && ! vect_slp_analyze_and_verify_node_alignment
2163 (SLP_INSTANCE_TREE (instance)))
2164 return false;
2166 return true;
2170 /* Analyze groups of accesses: check that DR belongs to a group of
2171 accesses of legal size, step, etc. Detect gaps, single element
2172 interleaving, and other special cases. Set grouped access info.
2173 Collect groups of strided stores for further use in SLP analysis.
2174 Worker for vect_analyze_group_access. */
2176 static bool
2177 vect_analyze_group_access_1 (struct data_reference *dr)
2179 tree step = DR_STEP (dr);
2180 tree scalar_type = TREE_TYPE (DR_REF (dr));
2181 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2182 gimple *stmt = DR_STMT (dr);
2183 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2184 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2185 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2186 HOST_WIDE_INT dr_step = -1;
2187 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2188 bool slp_impossible = false;
2190 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2191 size of the interleaving group (including gaps). */
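/* E.g. (illustrative numbers) a byte step of 32 with 4-byte elements
   gives a GROUPSIZE of 8 elements, gaps included.  */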
2192 if (tree_fits_shwi_p (step))
2194 dr_step = tree_to_shwi (step);
2195 /* Check that STEP is a multiple of type size. Otherwise there is
2196 a non-element-sized gap at the end of the group which we
2197 cannot represent in GROUP_GAP or GROUP_SIZE.
2198 ??? As we can handle non-constant step fine here we should
2199 simply remove uses of GROUP_GAP between the last and first
2200 element and instead rely on DR_STEP. GROUP_SIZE then would
2201 simply not include that gap. */
2202 if ((dr_step % type_size) != 0)
2204 if (dump_enabled_p ())
2206 dump_printf_loc (MSG_NOTE, vect_location,
2207 "Step ");
2208 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2209 dump_printf (MSG_NOTE,
2210 " is not a multiple of the element size for ");
2211 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2212 dump_printf (MSG_NOTE, "\n");
2214 return false;
2216 groupsize = absu_hwi (dr_step) / type_size;
2218 else
2219 groupsize = 0;
2221 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2222 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2224 /* Check if this DR is part of an interleaving group, and is a single
2225 element of the group that is accessed in the loop. */
2227 /* Gaps are supported only for loads. STEP must be a multiple of the type
2228 size. The size of the group must be a power of 2. */
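/* For instance (illustrative access), a load from a[2*i] with 4-byte
   elements has a byte step of 8, so GROUPSIZE == 2 (a power of 2) and the
   skipped element is recorded as a GROUP_GAP of 1.  */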
2229 if (DR_IS_READ (dr)
2230 && (dr_step % type_size) == 0
2231 && groupsize > 0
2232 && exact_log2 (groupsize) != -1)
2234 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2235 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2236 GROUP_GAP (stmt_info) = groupsize - 1;
2237 if (dump_enabled_p ())
2239 dump_printf_loc (MSG_NOTE, vect_location,
2240 "Detected single element interleaving ");
2241 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2242 dump_printf (MSG_NOTE, " step ");
2243 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2244 dump_printf (MSG_NOTE, "\n");
2247 return true;
2250 if (dump_enabled_p ())
2252 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2253 "not consecutive access ");
2254 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2257 if (bb_vinfo)
2259 /* Mark the statement as unvectorizable. */
2260 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2261 return true;
2264 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2265 STMT_VINFO_STRIDED_P (stmt_info) = true;
2266 return true;
2269 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2271 /* First stmt in the interleaving chain. Check the chain. */
2272 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2273 struct data_reference *data_ref = dr;
2274 unsigned int count = 1;
2275 tree prev_init = DR_INIT (data_ref);
2276 gimple *prev = stmt;
2277 HOST_WIDE_INT diff, gaps = 0;
2279 while (next)
2281 /* Skip same data-refs. In case that two or more stmts share
2282 data-ref (supported only for loads), we vectorize only the first
2283 stmt, and the rest get their vectorized loads from the first
2284 one. */
2285 if (!tree_int_cst_compare (DR_INIT (data_ref),
2286 DR_INIT (STMT_VINFO_DATA_REF (
2287 vinfo_for_stmt (next)))))
2289 if (DR_IS_WRITE (data_ref))
2291 if (dump_enabled_p ())
2292 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2293 "Two store stmts share the same dr.\n");
2294 return false;
2297 if (dump_enabled_p ())
2298 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2299 "Two or more load stmts share the same dr.\n");
2301 /* For load use the same data-ref load. */
2302 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2304 prev = next;
2305 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2306 continue;
2309 prev = next;
2310 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2312 /* All group members have the same STEP by construction. */
2313 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2315 /* Check that the distance between two accesses is equal to the type
2316 size. Otherwise, we have gaps. */
2317 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2318 - TREE_INT_CST_LOW (prev_init)) / type_size;
2319 if (diff != 1)
2321 /* FORNOW: SLP of accesses with gaps is not supported. */
2322 slp_impossible = true;
2323 if (DR_IS_WRITE (data_ref))
2325 if (dump_enabled_p ())
2326 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2327 "interleaved store with gaps\n");
2328 return false;
2331 gaps += diff - 1;
2334 last_accessed_element += diff;
2336 /* Store the gap from the previous member of the group. If there is no
2337 gap in the access, GROUP_GAP is always 1. */
2338 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2340 prev_init = DR_INIT (data_ref);
2341 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2342 /* Count the number of data-refs in the chain. */
2343 count++;
2346 if (groupsize == 0)
2347 groupsize = count + gaps;
2349 if (groupsize > UINT_MAX)
2351 if (dump_enabled_p ())
2352 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2353 "group is too large\n");
2354 return false;
2357 /* Check that the size of the interleaving is equal to count for stores,
2358 i.e., that there are no gaps. */
2359 if (groupsize != count
2360 && !DR_IS_READ (dr))
2362 if (dump_enabled_p ())
2363 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2364 "interleaved store with gaps\n");
2365 return false;
2368 /* If there is a gap after the last load in the group it is the
2369 difference between the groupsize and the last accessed
2370 element.
2371 When there is no gap, this difference should be 0. */
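/* E.g. (illustrative numbers) a group of size 4 whose last accessed
   element is the third one is recorded with a trailing gap of 1.  */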
2372 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2374 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2375 if (dump_enabled_p ())
2377 dump_printf_loc (MSG_NOTE, vect_location,
2378 "Detected interleaving ");
2379 if (DR_IS_READ (dr))
2380 dump_printf (MSG_NOTE, "load ");
2381 else
2382 dump_printf (MSG_NOTE, "store ");
2383 dump_printf (MSG_NOTE, "of size %u starting with ",
2384 (unsigned)groupsize);
2385 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2386 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2387 dump_printf_loc (MSG_NOTE, vect_location,
2388 "There is a gap of %u elements after the group\n",
2389 GROUP_GAP (vinfo_for_stmt (stmt)));
2392 /* SLP: create an SLP data structure for every interleaving group of
2393 stores for further analysis in vect_analyze_slp. */
2394 if (DR_IS_WRITE (dr) && !slp_impossible)
2396 if (loop_vinfo)
2397 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2398 if (bb_vinfo)
2399 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2403 return true;
2406 /* Analyze groups of accesses: check that DR belongs to a group of
2407 accesses of legal size, step, etc. Detect gaps, single element
2408 interleaving, and other special cases. Set grouped access info.
2409 Collect groups of strided stores for further use in SLP analysis. */
2411 static bool
2412 vect_analyze_group_access (struct data_reference *dr)
2414 if (!vect_analyze_group_access_1 (dr))
2416 /* Dissolve the group if present. */
2417 gimple *next;
2418 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2419 while (stmt)
2421 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2422 next = GROUP_NEXT_ELEMENT (vinfo);
2423 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2424 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2425 stmt = next;
2427 return false;
2429 return true;
2432 /* Analyze the access pattern of the data-reference DR.
2433 In case of non-consecutive accesses call vect_analyze_group_access() to
2434 analyze groups of accesses. */
2436 static bool
2437 vect_analyze_data_ref_access (struct data_reference *dr)
2439 tree step = DR_STEP (dr);
2440 tree scalar_type = TREE_TYPE (DR_REF (dr));
2441 gimple *stmt = DR_STMT (dr);
2442 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2443 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2444 struct loop *loop = NULL;
2446 if (loop_vinfo)
2447 loop = LOOP_VINFO_LOOP (loop_vinfo);
2449 if (loop_vinfo && !step)
2451 if (dump_enabled_p ())
2452 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2453 "bad data-ref access in loop\n");
2454 return false;
2457 /* Allow loads with zero step in inner-loop vectorization. */
2458 if (loop_vinfo && integer_zerop (step))
2460 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2461 if (!nested_in_vect_loop_p (loop, stmt))
2462 return DR_IS_READ (dr);
2463 /* Allow references with zero step for outer loops marked
2464 with pragma omp simd only - it guarantees absence of
2465 loop-carried dependencies between inner loop iterations. */
2466 if (!loop->force_vectorize)
2468 if (dump_enabled_p ())
2469 dump_printf_loc (MSG_NOTE, vect_location,
2470 "zero step in inner loop of nest\n");
2471 return false;
2475 if (loop && nested_in_vect_loop_p (loop, stmt))
2477 /* Interleaved accesses are not yet supported within outer-loop
2478 vectorization for references in the inner-loop. */
2479 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2481 /* For the rest of the analysis we use the outer-loop step. */
2482 step = STMT_VINFO_DR_STEP (stmt_info);
2483 if (integer_zerop (step))
2485 if (dump_enabled_p ())
2486 dump_printf_loc (MSG_NOTE, vect_location,
2487 "zero step in outer loop.\n");
2488 return DR_IS_READ (dr);
2492 /* Consecutive? */
2493 if (TREE_CODE (step) == INTEGER_CST)
2495 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2496 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2497 || (dr_step < 0
2498 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2500 /* Mark that it is not interleaving. */
2501 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2502 return true;
2506 if (loop && nested_in_vect_loop_p (loop, stmt))
2508 if (dump_enabled_p ())
2509 dump_printf_loc (MSG_NOTE, vect_location,
2510 "grouped access in outer loop.\n");
2511 return false;
2515 /* Assume this is a DR handled by non-constant strided load case. */
2516 if (TREE_CODE (step) != INTEGER_CST)
2517 return (STMT_VINFO_STRIDED_P (stmt_info)
2518 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2519 || vect_analyze_group_access (dr)));
2521 /* Not a consecutive access - check if it's part of an interleaving group. */
2522 return vect_analyze_group_access (dr);
2527 /* A helper function used in the comparator function to sort data
2528 references. T1 and T2 are two data references to be compared.
2529 The function returns -1, 0, or 1. */
2531 static int
2532 compare_tree (tree t1, tree t2)
2534 int i, cmp;
2535 enum tree_code code;
2536 char tclass;
2538 if (t1 == t2)
2539 return 0;
2540 if (t1 == NULL)
2541 return -1;
2542 if (t2 == NULL)
2543 return 1;
2545 STRIP_NOPS (t1);
2546 STRIP_NOPS (t2);
2548 if (TREE_CODE (t1) != TREE_CODE (t2))
2549 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2551 code = TREE_CODE (t1);
2552 switch (code)
2554 /* For const values, we can just use hash values for comparisons. */
2555 case INTEGER_CST:
2556 case REAL_CST:
2557 case FIXED_CST:
2558 case STRING_CST:
2559 case COMPLEX_CST:
2560 case VECTOR_CST:
2562 hashval_t h1 = iterative_hash_expr (t1, 0);
2563 hashval_t h2 = iterative_hash_expr (t2, 0);
2564 if (h1 != h2)
2565 return h1 < h2 ? -1 : 1;
2566 break;
2569 case SSA_NAME:
2570 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2571 if (cmp != 0)
2572 return cmp;
2574 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2575 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2576 break;
2578 default:
2579 tclass = TREE_CODE_CLASS (code);
2581 /* For var-decl, we could compare their UIDs. */
2582 if (tclass == tcc_declaration)
2584 if (DECL_UID (t1) != DECL_UID (t2))
2585 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2586 break;
2589 /* For expressions with operands, compare their operands recursively. */
2590 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2592 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2593 if (cmp != 0)
2594 return cmp;
2598 return 0;
2602 /* Compare two data-references DRA and DRB to sort them into chunks
2603 suitable for grouping. */
2605 static int
2606 dr_group_sort_cmp (const void *dra_, const void *drb_)
2608 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2609 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2610 int cmp;
2612 /* Stabilize sort. */
2613 if (dra == drb)
2614 return 0;
2616 /* DRs in different loops never belong to the same group. */
2617 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2618 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2619 if (loopa != loopb)
2620 return loopa->num < loopb->num ? -1 : 1;
2622 /* Ordering of DRs according to base. */
2623 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2625 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2626 if (cmp != 0)
2627 return cmp;
2630 /* And according to DR_OFFSET. */
2631 if (!dr_equal_offsets_p (dra, drb))
2633 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2634 if (cmp != 0)
2635 return cmp;
2638 /* Put reads before writes. */
2639 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2640 return DR_IS_READ (dra) ? -1 : 1;
2642 /* Then sort after access size. */
2643 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2644 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2646 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2647 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2648 if (cmp != 0)
2649 return cmp;
2652 /* And after step. */
2653 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2655 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2656 if (cmp != 0)
2657 return cmp;
2660 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2661 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2662 if (cmp == 0)
2663 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2664 return cmp;
2667 /* Function vect_analyze_data_ref_accesses.
2669 Analyze the access pattern of all the data references in the loop.
2671 FORNOW: the only access pattern that is considered vectorizable is a
2672 simple step 1 (consecutive) access.
2674 FORNOW: handle only arrays and pointer accesses. */
2676 bool
2677 vect_analyze_data_ref_accesses (vec_info *vinfo)
2679 unsigned int i;
2680 vec<data_reference_p> datarefs = vinfo->datarefs;
2681 struct data_reference *dr;
2683 if (dump_enabled_p ())
2684 dump_printf_loc (MSG_NOTE, vect_location,
2685 "=== vect_analyze_data_ref_accesses ===\n");
2687 if (datarefs.is_empty ())
2688 return true;
2690 /* Sort the array of datarefs to make building the interleaving chains
2691 linear. Don't modify the original vector's order; it is needed for
2692 determining what dependencies are reversed. */
2693 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2694 datarefs_copy.qsort (dr_group_sort_cmp);
2696 /* Build the interleaving chains. */
2697 for (i = 0; i < datarefs_copy.length () - 1;)
2699 data_reference_p dra = datarefs_copy[i];
2700 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2701 stmt_vec_info lastinfo = NULL;
2702 for (i = i + 1; i < datarefs_copy.length (); ++i)
2704 data_reference_p drb = datarefs_copy[i];
2705 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2707 /* ??? Imperfect sorting (non-compatible types, non-modulo
2708 accesses, same accesses) can lead to a group being artificially
2709 split here as we don't just skip over those. If it really
2710 matters we can push those to a worklist and re-iterate
2711 over them. Then we can just skip ahead to the next DR here. */
2713 /* DRs in a different loop should not be put into the same
2714 interleaving group. */
2715 if (gimple_bb (DR_STMT (dra))->loop_father
2716 != gimple_bb (DR_STMT (drb))->loop_father)
2717 break;
2719 /* Check that the data-refs have the same first location (except init)
2720 and they are both either store or load (not load and store,
2721 not masked loads or stores). */
2722 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2723 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2724 DR_BASE_ADDRESS (drb), 0)
2725 || !dr_equal_offsets_p (dra, drb)
2726 || !gimple_assign_single_p (DR_STMT (dra))
2727 || !gimple_assign_single_p (DR_STMT (drb)))
2728 break;
2730 /* Check that the data-refs have the same constant size. */
2731 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2732 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2733 if (!tree_fits_uhwi_p (sza)
2734 || !tree_fits_uhwi_p (szb)
2735 || !tree_int_cst_equal (sza, szb))
2736 break;
2738 /* Check that the data-refs have the same step. */
2739 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2740 break;
2742 /* Do not place the same access in the interleaving chain twice. */
2743 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2744 break;
2746 /* Check the types are compatible.
2747 ??? We don't distinguish this during sorting. */
2748 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2749 TREE_TYPE (DR_REF (drb))))
2750 break;
2752 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2753 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2754 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2755 gcc_assert (init_a < init_b);
2757 /* If init_b == init_a + the size of the type * k, we have an
2758 interleaving, and DRA is accessed before DRB. */
2759 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2760 if (type_size_a == 0
2761 || (init_b - init_a) % type_size_a != 0)
2762 break;
2764 /* If we have a store, the accesses are adjacent. This splits
2765 groups into chunks we support (we don't support vectorization
2766 of stores with gaps). */
2767 if (!DR_IS_READ (dra)
2768 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2769 (DR_INIT (datarefs_copy[i-1]))
2770 != type_size_a))
2771 break;
2773 /* If the step (if not zero or non-constant) is greater than the
2774 difference between data-refs' inits this splits groups into
2775 suitable sizes. */
2776 if (tree_fits_shwi_p (DR_STEP (dra)))
2778 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2779 if (step != 0 && step <= (init_b - init_a))
2780 break;
2783 if (dump_enabled_p ())
2785 dump_printf_loc (MSG_NOTE, vect_location,
2786 "Detected interleaving ");
2787 if (DR_IS_READ (dra))
2788 dump_printf (MSG_NOTE, "load ");
2789 else
2790 dump_printf (MSG_NOTE, "store ");
2791 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2792 dump_printf (MSG_NOTE, " and ");
2793 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2794 dump_printf (MSG_NOTE, "\n");
2797 /* Link the found element into the group list. */
2798 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2800 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2801 lastinfo = stmtinfo_a;
2803 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2804 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2805 lastinfo = stmtinfo_b;
2809 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2810 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2811 && !vect_analyze_data_ref_access (dr))
2813 if (dump_enabled_p ())
2814 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2815 "not vectorized: complicated access pattern.\n");
2817 if (is_a <bb_vec_info> (vinfo))
2819 /* Mark the statement as not vectorizable. */
2820 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2821 continue;
2823 else
2825 datarefs_copy.release ();
2826 return false;
2830 datarefs_copy.release ();
2831 return true;
2835 /* Operator == between two dr_with_seg_len objects.
2837 This equality operator is used to make sure two data refs
2838 are the same so that we will consider combining the
2839 aliasing checks of those two pairs of data dependent data
2840 refs. */
2842 static bool
2843 operator == (const dr_with_seg_len& d1,
2844 const dr_with_seg_len& d2)
2846 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2847 DR_BASE_ADDRESS (d2.dr), 0)
2848 && compare_tree (d1.offset, d2.offset) == 0
2849 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2852 /* Function comp_dr_with_seg_len_pair.
2854 Comparison function for sorting objects of dr_with_seg_len_pair_t
2855 so that we can combine aliasing checks in one scan. */
2857 static int
2858 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2860 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2861 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2863 const dr_with_seg_len &p11 = p1->first,
2864 &p12 = p1->second,
2865 &p21 = p2->first,
2866 &p22 = p2->second;
2868 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2869 if a and c have the same base address and step, and b and d have the same
2870 address and step. Therefore, if either a and c or b and d do not have the
2871 same address and step, we do not care about the order of those two pairs after sorting. */
2872 int comp_res;
2874 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2875 DR_BASE_ADDRESS (p21.dr))) != 0)
2876 return comp_res;
2877 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2878 DR_BASE_ADDRESS (p22.dr))) != 0)
2879 return comp_res;
2880 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2881 return comp_res;
2882 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2883 return comp_res;
2884 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2885 return comp_res;
2886 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2887 return comp_res;
2889 return 0;
2892 /* Function vect_vfa_segment_size.
2894 Create an expression that computes the size of segment
2895 that will be accessed for a data reference. The functions takes into
2896 account that realignment loads may access one more vector.
2898 Input:
2899 DR: The data reference.
2900 LENGTH_FACTOR: segment length to consider.
2902 Return an expression whose value is the size of segment which will be
2903 accessed by DR. */
2905 static tree
2906 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2908 tree segment_length;
2910 if (integer_zerop (DR_STEP (dr)))
2911 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2912 else
2913 segment_length = size_binop (MULT_EXPR,
2914 fold_convert (sizetype, DR_STEP (dr)),
2915 fold_convert (sizetype, length_factor));
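/* E.g. (illustrative numbers) a DR with a 4-byte step and a LENGTH_FACTOR
   corresponding to a vectorization factor of 4 gets a 16-byte segment;
   the realignment case below adds one more vector.  */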
2917 if (vect_supportable_dr_alignment (dr, false)
2918 == dr_explicit_realign_optimized)
2920 tree vector_size = TYPE_SIZE_UNIT
2921 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2923 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2925 return segment_length;
2928 /* Function vect_prune_runtime_alias_test_list.
2930 Prune a list of ddrs to be tested at run-time by versioning for alias.
2931 Merge several alias checks into one if possible.
2932 Return FALSE if the resulting list of ddrs is longer than allowed by
2933 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2935 bool
2936 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2938 vec<ddr_p> may_alias_ddrs =
2939 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2940 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2941 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2942 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2943 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2945 ddr_p ddr;
2946 unsigned int i;
2947 tree length_factor;
2949 if (dump_enabled_p ())
2950 dump_printf_loc (MSG_NOTE, vect_location,
2951 "=== vect_prune_runtime_alias_test_list ===\n");
2953 if (may_alias_ddrs.is_empty ())
2954 return true;
2956 /* Basically, for each pair of dependent data refs store_ptr_0
2957 and load_ptr_0, we create an expression:
2959 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2960 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2962 for aliasing checks. However, in some cases we can decrease
2963 the number of checks by combining two checks into one. For
2964 example, suppose we have another pair of data refs store_ptr_0
2965 and load_ptr_1, and if the following condition is satisfied:
2967 load_ptr_0 < load_ptr_1 &&
2968 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2970 (this condition means, in each iteration of the vectorized loop,
2971 the accessed memory of store_ptr_0 cannot be between the memory
2972 of load_ptr_0 and load_ptr_1.)
2974 we then can use only the following expression to finish the
2975 aliasing checks between store_ptr_0 & load_ptr_0 and
2976 store_ptr_0 & load_ptr_1:
2978 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2979 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2981 Note that we only consider the case in which load_ptr_0 and load_ptr_1 have
2982 the same base address. */
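/* As an illustrative example, if load_ptr_1 == load_ptr_0 + 8 and
   load_segment_length_0 == store_segment_length_0 == 16, then
   8 - 16 < 16 holds and both pairs are covered by the single check
   ((store_ptr_0 + 16) <= load_ptr_0)
   || (load_ptr_1 + load_segment_length_1 <= store_ptr_0).  */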
2984 comp_alias_ddrs.create (may_alias_ddrs.length ());
2986 /* First, we collect all data ref pairs for aliasing checks. */
2987 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2989 struct data_reference *dr_a, *dr_b;
2990 gimple *dr_group_first_a, *dr_group_first_b;
2991 tree segment_length_a, segment_length_b;
2992 gimple *stmt_a, *stmt_b;
2994 dr_a = DDR_A (ddr);
2995 stmt_a = DR_STMT (DDR_A (ddr));
2996 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2997 if (dr_group_first_a)
2999 stmt_a = dr_group_first_a;
3000 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3003 dr_b = DDR_B (ddr);
3004 stmt_b = DR_STMT (DDR_B (ddr));
3005 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3006 if (dr_group_first_b)
3008 stmt_b = dr_group_first_b;
3009 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3012 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3013 length_factor = scalar_loop_iters;
3014 else
3015 length_factor = size_int (vect_factor);
3016 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3017 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3019 dr_with_seg_len_pair_t dr_with_seg_len_pair
3020 (dr_with_seg_len (dr_a, segment_length_a),
3021 dr_with_seg_len (dr_b, segment_length_b));
3023 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
3024 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3026 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3029 /* Second, we sort the collected data ref pairs so that we can scan
3030 them once to combine all possible aliasing checks. */
3031 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
3033 /* Third, we scan the sorted dr pairs and check if we can combine
3034 alias checks of two neighboring dr pairs. */
3035 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
3037 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
3038 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
3039 *dr_b1 = &comp_alias_ddrs[i-1].second,
3040 *dr_a2 = &comp_alias_ddrs[i].first,
3041 *dr_b2 = &comp_alias_ddrs[i].second;
3043 /* Remove duplicate data ref pairs. */
3044 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
3046 if (dump_enabled_p ())
3048 dump_printf_loc (MSG_NOTE, vect_location,
3049 "found equal ranges ");
3050 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3051 DR_REF (dr_a1->dr));
3052 dump_printf (MSG_NOTE, ", ");
3053 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3054 DR_REF (dr_b1->dr));
3055 dump_printf (MSG_NOTE, " and ");
3056 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3057 DR_REF (dr_a2->dr));
3058 dump_printf (MSG_NOTE, ", ");
3059 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3060 DR_REF (dr_b2->dr));
3061 dump_printf (MSG_NOTE, "\n");
3064 comp_alias_ddrs.ordered_remove (i--);
3065 continue;
3068 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
3070 /* We consider the case that DR_B1 and DR_B2 are the same memref,
3071 and DR_A1 and DR_A2 are two consecutive memrefs. */
3072 if (*dr_a1 == *dr_a2)
3074 std::swap (dr_a1, dr_b1);
3075 std::swap (dr_a2, dr_b2);
3078 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
3079 DR_BASE_ADDRESS (dr_a2->dr),
3081 || !tree_fits_shwi_p (dr_a1->offset)
3082 || !tree_fits_shwi_p (dr_a2->offset))
3083 continue;
3085 /* Make sure dr_a1 starts left of dr_a2. */
3086 if (tree_int_cst_lt (dr_a2->offset, dr_a1->offset))
3087 std::swap (*dr_a1, *dr_a2);
3089 unsigned HOST_WIDE_INT diff
3090 = tree_to_shwi (dr_a2->offset) - tree_to_shwi (dr_a1->offset);
3093 bool do_remove = false;
3095 /* If the left segment does not extend beyond the start of the
3096 right segment the new segment length is that of the right
3097 plus the segment distance. */
3098 if (tree_fits_uhwi_p (dr_a1->seg_len)
3099 && compare_tree_int (dr_a1->seg_len, diff) <= 0)
3101 dr_a1->seg_len = size_binop (PLUS_EXPR, dr_a2->seg_len,
3102 size_int (diff));
3103 do_remove = true;
3105 /* Generally the new segment length is the maximum of the
3106 left segment size and the right segment size plus the distance.
3107 ??? We can also build tree MAX_EXPR here but it's not clear this
3108 is profitable. */
3109 else if (tree_fits_uhwi_p (dr_a1->seg_len)
3110 && tree_fits_uhwi_p (dr_a2->seg_len))
3112 unsigned HOST_WIDE_INT seg_len_a1 = tree_to_uhwi (dr_a1->seg_len);
3113 unsigned HOST_WIDE_INT seg_len_a2 = tree_to_uhwi (dr_a2->seg_len);
3114 dr_a1->seg_len = size_int (MAX (seg_len_a1, diff + seg_len_a2));
3115 do_remove = true;
3117 /* Now we check if the following condition is satisfied:
3119 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
3121 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
3122 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
3123 have to make a best estimation. We can get the minimum value
3124 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
3125 then either of the following two conditions can guarantee the
3126 one above:
3128 1: DIFF <= MIN_SEG_LEN_B
3129 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B */
3130 else
3132 unsigned HOST_WIDE_INT min_seg_len_b
3133 = (tree_fits_uhwi_p (dr_b1->seg_len)
3134 ? tree_to_uhwi (dr_b1->seg_len)
3135 : vect_factor);
3137 if (diff <= min_seg_len_b
3138 || (tree_fits_uhwi_p (dr_a1->seg_len)
3139 && diff - tree_to_uhwi (dr_a1->seg_len) < min_seg_len_b))
3141 dr_a1->seg_len = size_binop (PLUS_EXPR,
3142 dr_a2->seg_len, size_int (diff));
3143 do_remove = true;
3147 if (do_remove)
3149 if (dump_enabled_p ())
3151 dump_printf_loc (MSG_NOTE, vect_location,
3152 "merging ranges for ");
3153 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1->dr));
3154 dump_printf (MSG_NOTE, ", ");
3155 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1->dr));
3156 dump_printf (MSG_NOTE, " and ");
3157 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2->dr));
3158 dump_printf (MSG_NOTE, ", ");
3159 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2->dr));
3160 dump_printf (MSG_NOTE, "\n");
3162 comp_alias_ddrs.ordered_remove (i--);
3167 dump_printf_loc (MSG_NOTE, vect_location,
3168 "improved number of alias checks from %d to %d\n",
3169 may_alias_ddrs.length (), comp_alias_ddrs.length ());
3170 if ((int) comp_alias_ddrs.length () >
3171 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3172 return false;
3174 return true;
3177 /* Check whether a non-affine read or write in stmt is suitable for gather load
3178 or scatter store and if so, return a builtin decl for that operation. */
3180 tree
3181 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo, tree *basep,
3182 tree *offp, int *scalep)
3184 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3185 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3186 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3187 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3188 tree offtype = NULL_TREE;
3189 tree decl, base, off;
3190 machine_mode pmode;
3191 int punsignedp, reversep, pvolatilep = 0;
3193 base = DR_REF (dr);
3194 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3195 see if we can use the def stmt of the address. */
3196 if (is_gimple_call (stmt)
3197 && gimple_call_internal_p (stmt)
3198 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3199 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3200 && TREE_CODE (base) == MEM_REF
3201 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3202 && integer_zerop (TREE_OPERAND (base, 1))
3203 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3205 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3206 if (is_gimple_assign (def_stmt)
3207 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3208 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3211 /* The gather and scatter builtins need an address of the form
3212 loop_invariant + vector * {1, 2, 4, 8}
3214 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3215 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3216 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3217 multiplications and additions in it. To get a vector, we need
3218 a single SSA_NAME that will be defined in the loop and will
3219 contain everything that is not loop invariant and that can be
3220 vectorized. The following code attempts to find such a preexisting
3221 SSA_NAME OFF and put the loop invariants into a tree BASE
3222 that can be gimplified before the loop. */
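/* As an illustrative example, for a gather of the form a[b[i]] the
   loop-invariant part (&a plus any constant offset) ends up in BASE,
   OFF becomes the SSA_NAME holding the (possibly sign-extended) b[i]
   value, and SCALE the element size of a.  */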
3223 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3224 &punsignedp, &reversep, &pvolatilep, false);
3225 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3227 if (TREE_CODE (base) == MEM_REF)
3229 if (!integer_zerop (TREE_OPERAND (base, 1)))
3231 if (off == NULL_TREE)
3233 offset_int moff = mem_ref_offset (base);
3234 off = wide_int_to_tree (sizetype, moff);
3236 else
3237 off = size_binop (PLUS_EXPR, off,
3238 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3240 base = TREE_OPERAND (base, 0);
3242 else
3243 base = build_fold_addr_expr (base);
3245 if (off == NULL_TREE)
3246 off = size_zero_node;
3248 /* If base is not loop invariant, then if off is 0 we start with just
3249 the constant offset in the loop invariant BASE and continue with base
3250 as OFF; otherwise give up.
3251 We could handle that case by gimplifying the addition of base + off
3252 into some SSA_NAME and use that as off, but for now punt. */
3253 if (!expr_invariant_in_loop_p (loop, base))
3255 if (!integer_zerop (off))
3256 return NULL_TREE;
3257 off = base;
3258 base = size_int (pbitpos / BITS_PER_UNIT);
3260 /* Otherwise put base + constant offset into the loop invariant BASE
3261 and continue with OFF. */
3262 else
3264 base = fold_convert (sizetype, base);
3265 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3268 /* OFF at this point may be either a SSA_NAME or some tree expression
3269 from get_inner_reference. Try to peel off loop invariants from it
3270 into BASE as long as possible. */
3271 STRIP_NOPS (off);
3272 while (offtype == NULL_TREE)
3274 enum tree_code code;
3275 tree op0, op1, add = NULL_TREE;
3277 if (TREE_CODE (off) == SSA_NAME)
3279 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3281 if (expr_invariant_in_loop_p (loop, off))
3282 return NULL_TREE;
3284 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3285 break;
3287 op0 = gimple_assign_rhs1 (def_stmt);
3288 code = gimple_assign_rhs_code (def_stmt);
3289 op1 = gimple_assign_rhs2 (def_stmt);
3291 else
3293 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3294 return NULL_TREE;
3295 code = TREE_CODE (off);
3296 extract_ops_from_tree (off, &code, &op0, &op1);
3298 switch (code)
3300 case POINTER_PLUS_EXPR:
3301 case PLUS_EXPR:
3302 if (expr_invariant_in_loop_p (loop, op0))
3304 add = op0;
3305 off = op1;
3306 do_add:
3307 add = fold_convert (sizetype, add);
3308 if (scale != 1)
3309 add = size_binop (MULT_EXPR, add, size_int (scale));
3310 base = size_binop (PLUS_EXPR, base, add);
3311 continue;
3313 if (expr_invariant_in_loop_p (loop, op1))
3315 add = op1;
3316 off = op0;
3317 goto do_add;
3319 break;
3320 case MINUS_EXPR:
3321 if (expr_invariant_in_loop_p (loop, op1))
3323 add = fold_convert (sizetype, op1);
3324 add = size_binop (MINUS_EXPR, size_zero_node, add);
3325 off = op0;
3326 goto do_add;
3328 break;
3329 case MULT_EXPR:
3330 if (scale == 1 && tree_fits_shwi_p (op1))
3332 scale = tree_to_shwi (op1);
3333 off = op0;
3334 continue;
3336 break;
3337 case SSA_NAME:
3338 off = op0;
3339 continue;
3340 CASE_CONVERT:
3341 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3342 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3343 break;
3344 if (TYPE_PRECISION (TREE_TYPE (op0))
3345 == TYPE_PRECISION (TREE_TYPE (off)))
3347 off = op0;
3348 continue;
3350 if (TYPE_PRECISION (TREE_TYPE (op0))
3351 < TYPE_PRECISION (TREE_TYPE (off)))
3353 off = op0;
3354 offtype = TREE_TYPE (off);
3355 STRIP_NOPS (off);
3356 continue;
3358 break;
3359 default:
3360 break;
3362 break;
3365 /* If at the end OFF still isn't a SSA_NAME or isn't
3366 defined in the loop, punt. */
3367 if (TREE_CODE (off) != SSA_NAME
3368 || expr_invariant_in_loop_p (loop, off))
3369 return NULL_TREE;
3371 if (offtype == NULL_TREE)
3372 offtype = TREE_TYPE (off);
3374 if (DR_IS_READ (dr))
3375 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3376 offtype, scale);
3377 else
3378 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3379 offtype, scale);
3381 if (decl == NULL_TREE)
3382 return NULL_TREE;
3384 if (basep)
3385 *basep = base;
3386 if (offp)
3387 *offp = off;
3388 if (scalep)
3389 *scalep = scale;
3390 return decl;
3393 /* Function vect_analyze_data_refs.
3395 Find all the data references in the loop or basic block.
3397 The general structure of the analysis of data refs in the vectorizer is as
3398 follows:
3399 1- vect_analyze_data_refs(loop/bb): call
3400 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3401 in the loop/bb and their dependences.
3402 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3403 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3404 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3408 bool
3409 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3411 struct loop *loop = NULL;
3412 unsigned int i;
3413 struct data_reference *dr;
3414 tree scalar_type;
3416 if (dump_enabled_p ())
3417 dump_printf_loc (MSG_NOTE, vect_location,
3418 "=== vect_analyze_data_refs ===\n");
3420 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3421 loop = LOOP_VINFO_LOOP (loop_vinfo);
3423 /* Go through the data-refs, check that the analysis succeeded. Update
3424 pointer from stmt_vec_info struct to DR and vectype. */
3426 vec<data_reference_p> datarefs = vinfo->datarefs;
3427 FOR_EACH_VEC_ELT (datarefs, i, dr)
3429 gimple *stmt;
3430 stmt_vec_info stmt_info;
3431 tree base, offset, init;
3432 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3433 bool simd_lane_access = false;
3434 int vf;
3436 again:
3437 if (!dr || !DR_REF (dr))
3439 if (dump_enabled_p ())
3440 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3441 "not vectorized: unhandled data-ref\n");
3442 return false;
3445 stmt = DR_STMT (dr);
3446 stmt_info = vinfo_for_stmt (stmt);
3448 /* Discard clobbers from the dataref vector. We will remove
3449 clobber stmts during vectorization. */
3450 if (gimple_clobber_p (stmt))
3452 free_data_ref (dr);
3453 if (i == datarefs.length () - 1)
3455 datarefs.pop ();
3456 break;
3458 datarefs.ordered_remove (i);
3459 dr = datarefs[i];
3460 goto again;
3463 /* Check that analysis of the data-ref succeeded. */
3464 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3465 || !DR_STEP (dr))
3467 bool maybe_gather
3468 = DR_IS_READ (dr)
3469 && !TREE_THIS_VOLATILE (DR_REF (dr))
3470 && targetm.vectorize.builtin_gather != NULL;
3471 bool maybe_scatter
3472 = DR_IS_WRITE (dr)
3473 && !TREE_THIS_VOLATILE (DR_REF (dr))
3474 && targetm.vectorize.builtin_scatter != NULL;
3475 bool maybe_simd_lane_access
3476 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3478 /* If target supports vector gather loads or scatter stores, or if
3479 this might be a SIMD lane access, check whether they can be used. */
3480 if (is_a <loop_vec_info> (vinfo)
3481 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3482 && !nested_in_vect_loop_p (loop, stmt))
3484 struct data_reference *newdr
3485 = create_data_ref (NULL, loop_containing_stmt (stmt),
3486 DR_REF (dr), stmt, maybe_scatter ? false : true);
3487 gcc_assert (newdr != NULL && DR_REF (newdr));
3488 if (DR_BASE_ADDRESS (newdr)
3489 && DR_OFFSET (newdr)
3490 && DR_INIT (newdr)
3491 && DR_STEP (newdr)
3492 && integer_zerop (DR_STEP (newdr)))
3494 if (maybe_simd_lane_access)
3496 tree off = DR_OFFSET (newdr);
3497 STRIP_NOPS (off);
3498 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3499 && TREE_CODE (off) == MULT_EXPR
3500 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3502 tree step = TREE_OPERAND (off, 1);
3503 off = TREE_OPERAND (off, 0);
3504 STRIP_NOPS (off);
3505 if (CONVERT_EXPR_P (off)
3506 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3507 0)))
3508 < TYPE_PRECISION (TREE_TYPE (off)))
3509 off = TREE_OPERAND (off, 0);
3510 if (TREE_CODE (off) == SSA_NAME)
3512 gimple *def = SSA_NAME_DEF_STMT (off);
3513 tree reft = TREE_TYPE (DR_REF (newdr));
3514 if (is_gimple_call (def)
3515 && gimple_call_internal_p (def)
3516 && (gimple_call_internal_fn (def)
3517 == IFN_GOMP_SIMD_LANE))
3519 tree arg = gimple_call_arg (def, 0);
3520 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3521 arg = SSA_NAME_VAR (arg);
3522 if (arg == loop->simduid
3523 /* For now. */
3524 && tree_int_cst_equal
3525 (TYPE_SIZE_UNIT (reft),
3526 step))
3528 DR_OFFSET (newdr) = ssize_int (0);
3529 DR_STEP (newdr) = step;
3530 DR_ALIGNED_TO (newdr)
3531 = size_int (BIGGEST_ALIGNMENT);
3532 dr = newdr;
3533 simd_lane_access = true;
3539 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3541 dr = newdr;
3542 if (maybe_gather)
3543 gatherscatter = GATHER;
3544 else
3545 gatherscatter = SCATTER;
3548 if (gatherscatter == SG_NONE && !simd_lane_access)
3549 free_data_ref (newdr);
3552 if (gatherscatter == SG_NONE && !simd_lane_access)
3554 if (dump_enabled_p ())
3556 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3557 "not vectorized: data ref analysis "
3558 "failed ");
3559 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3560 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3563 if (is_a <bb_vec_info> (vinfo))
3564 break;
3566 return false;
3570 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3572 if (dump_enabled_p ())
3573 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3574 "not vectorized: base addr of dr is a "
3575 "constant\n");
3577 if (is_a <bb_vec_info> (vinfo))
3578 break;
3580 if (gatherscatter != SG_NONE || simd_lane_access)
3581 free_data_ref (dr);
3582 return false;
3585 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3587 if (dump_enabled_p ())
3589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3590 "not vectorized: volatile type ");
3591 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3592 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3595 if (is_a <bb_vec_info> (vinfo))
3596 break;
3598 return false;
3601 if (stmt_can_throw_internal (stmt))
3603 if (dump_enabled_p ())
3605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3606 "not vectorized: statement can throw an "
3607 "exception ");
3608 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3609 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3612 if (is_a <bb_vec_info> (vinfo))
3613 break;
3615 if (gatherscatter != SG_NONE || simd_lane_access)
3616 free_data_ref (dr);
3617 return false;
3620 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3621 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3623 if (dump_enabled_p ())
3625 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3626 "not vectorized: statement is bitfield "
3627 "access ");
3628 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3629 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3632 if (is_a <bb_vec_info> (vinfo))
3633 break;
3635 if (gatherscatter != SG_NONE || simd_lane_access)
3636 free_data_ref (dr);
3637 return false;
3640 base = unshare_expr (DR_BASE_ADDRESS (dr));
3641 offset = unshare_expr (DR_OFFSET (dr));
3642 init = unshare_expr (DR_INIT (dr));
3644 if (is_gimple_call (stmt)
3645 && (!gimple_call_internal_p (stmt)
3646 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3647 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3649 if (dump_enabled_p ())
3651 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3652 "not vectorized: dr in a call ");
3653 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3654 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3657 if (is_a <bb_vec_info> (vinfo))
3658 break;
3660 if (gatherscatter != SG_NONE || simd_lane_access)
3661 free_data_ref (dr);
3662 return false;
3665 /* Update DR field in stmt_vec_info struct. */
3667 /* If the dataref is in an inner-loop of the loop that is considered for
3668 vectorization, we also want to analyze the access relative to
3669 the outer-loop (DR contains information only relative to the
3670 inner-most enclosing loop). We do that by building a reference to the
3671 first location accessed by the inner-loop, and analyze it relative to
3672 the outer-loop. */
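 /* As a hedged illustration (names and numbers are only an example, not
 taken from a dump): for `short a[N][M]' accessed as a[i][j] in a j-loop
 nested in the i-loop being vectorized, the DR itself describes the access
 relative to the inner j-loop (step 2 bytes per j iteration), while the
 STMT_VINFO_DR_* fields computed below describe the first inner location
 a[i][0] relative to the outer loop: base address &a, constant init 0 and
 outer step 2 * M bytes (one full row per outer iteration). */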
3673 if (loop && nested_in_vect_loop_p (loop, stmt))
3675 tree outer_step, outer_base, outer_init;
3676 HOST_WIDE_INT pbitsize, pbitpos;
3677 tree poffset;
3678 machine_mode pmode;
3679 int punsignedp, preversep, pvolatilep;
3680 affine_iv base_iv, offset_iv;
3681 tree dinit;
3683 /* Build a reference to the first location accessed by the
3684 inner-loop: *(BASE+INIT). (The first location is actually
3685 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3686 tree inner_base = build_fold_indirect_ref
3687 (fold_build_pointer_plus (base, init));
3689 if (dump_enabled_p ())
3691 dump_printf_loc (MSG_NOTE, vect_location,
3692 "analyze in outer-loop: ");
3693 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3694 dump_printf (MSG_NOTE, "\n");
3697 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3698 &poffset, &pmode, &punsignedp,
3699 &preversep, &pvolatilep, false);
3700 gcc_assert (outer_base != NULL_TREE);
3702 if (pbitpos % BITS_PER_UNIT != 0)
3704 if (dump_enabled_p ())
3705 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3706 "failed: bit offset alignment.\n");
3707 return false;
3710 if (preversep)
3712 if (dump_enabled_p ())
3713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3714 "failed: reverse storage order.\n");
3715 return false;
3718 outer_base = build_fold_addr_expr (outer_base);
3719 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3720 &base_iv, false))
3722 if (dump_enabled_p ())
3723 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3724 "failed: evolution of base is not affine.\n");
3725 return false;
3728 if (offset)
3730 if (poffset)
3731 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3732 poffset);
3733 else
3734 poffset = offset;
3737 if (!poffset)
3739 offset_iv.base = ssize_int (0);
3740 offset_iv.step = ssize_int (0);
3742 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3743 &offset_iv, false))
3745 if (dump_enabled_p ())
3746 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3747 "evolution of offset is not affine.\n");
3748 return false;
3751 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3752 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3753 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3754 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3755 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3757 outer_step = size_binop (PLUS_EXPR,
3758 fold_convert (ssizetype, base_iv.step),
3759 fold_convert (ssizetype, offset_iv.step));
3761 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3762 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3763 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3764 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3765 STMT_VINFO_DR_OFFSET (stmt_info) =
3766 fold_convert (ssizetype, offset_iv.base);
3767 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3768 size_int (highest_pow2_factor (offset_iv.base));
3770 if (dump_enabled_p ())
3772 dump_printf_loc (MSG_NOTE, vect_location,
3773 "\touter base_address: ");
3774 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3775 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3776 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3777 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3778 STMT_VINFO_DR_OFFSET (stmt_info));
3779 dump_printf (MSG_NOTE,
3780 "\n\touter constant offset from base address: ");
3781 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3782 STMT_VINFO_DR_INIT (stmt_info));
3783 dump_printf (MSG_NOTE, "\n\touter step: ");
3784 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3785 STMT_VINFO_DR_STEP (stmt_info));
3786 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3787 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3788 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3789 dump_printf (MSG_NOTE, "\n");
3793 if (STMT_VINFO_DATA_REF (stmt_info))
3795 if (dump_enabled_p ())
3797 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3798 "not vectorized: more than one data ref "
3799 "in stmt: ");
3800 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3801 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3804 if (is_a <bb_vec_info> (vinfo))
3805 break;
3807 if (gatherscatter != SG_NONE || simd_lane_access)
3808 free_data_ref (dr);
3809 return false;
3812 STMT_VINFO_DATA_REF (stmt_info) = dr;
3813 if (simd_lane_access)
3815 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3816 free_data_ref (datarefs[i]);
3817 datarefs[i] = dr;
3820 /* Set vectype for STMT. */
3821 scalar_type = TREE_TYPE (DR_REF (dr));
3822 STMT_VINFO_VECTYPE (stmt_info)
3823 = get_vectype_for_scalar_type (scalar_type);
3824 if (!STMT_VINFO_VECTYPE (stmt_info))
3826 if (dump_enabled_p ())
3828 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3829 "not vectorized: no vectype for stmt: ");
3830 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3831 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3832 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3833 scalar_type);
3834 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3837 if (is_a <bb_vec_info> (vinfo))
3839 /* No vector type is fine, the ref can still participate
3840 in dependence analysis, we just can't vectorize it. */
3841 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3842 continue;
3845 if (gatherscatter != SG_NONE || simd_lane_access)
3847 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3848 if (gatherscatter != SG_NONE)
3849 free_data_ref (dr);
3851 return false;
3853 else
3855 if (dump_enabled_p ())
3857 dump_printf_loc (MSG_NOTE, vect_location,
3858 "got vectype for stmt: ");
3859 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3860 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3861 STMT_VINFO_VECTYPE (stmt_info));
3862 dump_printf (MSG_NOTE, "\n");
3866 /* Adjust the minimal vectorization factor according to the
3867 vector type. */
3868 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3869 if (vf > *min_vf)
3870 *min_vf = vf;
3872 if (gatherscatter != SG_NONE)
3874 tree off;
3875 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3876 NULL, &off, NULL)
3877 || get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3879 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3880 free_data_ref (dr);
3881 if (dump_enabled_p ())
3883 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3884 (gatherscatter == GATHER) ?
3885 "not vectorized: not suitable for gather "
3886 "load " :
3887 "not vectorized: not suitable for scatter "
3888 "store ");
3889 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3890 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3892 return false;
3895 free_data_ref (datarefs[i]);
3896 datarefs[i] = dr;
3897 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3900 else if (is_a <loop_vec_info> (vinfo)
3901 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3903 if (nested_in_vect_loop_p (loop, stmt))
3905 if (dump_enabled_p ())
3907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3908 "not vectorized: not suitable for strided "
3909 "load ");
3910 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3911 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3913 return false;
3915 STMT_VINFO_STRIDED_P (stmt_info) = true;
3919 /* If we stopped analysis at the first dataref we could not analyze
3920 when trying to vectorize a basic-block mark the rest of the datarefs
3921 as not vectorizable and truncate the vector of datarefs. That
3922 avoids spending useless time in analyzing their dependence. */
3923 if (i != datarefs.length ())
3925 gcc_assert (is_a <bb_vec_info> (vinfo));
3926 for (unsigned j = i; j < datarefs.length (); ++j)
3928 data_reference_p dr = datarefs[j];
3929 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3930 free_data_ref (dr);
3932 datarefs.truncate (i);
3935 return true;
3939 /* Function vect_get_new_vect_var.
3941 Returns a name for a new variable. The current naming scheme uses a
3942 prefix such as "vect_", "vectp_", "stmp_" or "mask_" (depending on the
3943 value of VAR_KIND) for vectorizer-generated variables, and appends
3944 NAME to that prefix if NAME is provided. */
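/* For instance (illustrative only; the exact suffix used to make the
 resulting decl name unique may differ):

 vect_get_new_vect_var (ptr_type, vect_pointer_var, "in")

 returns a new register decl whose name starts with "vectp_in". */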
3946 tree
3947 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3949 const char *prefix;
3950 tree new_vect_var;
3952 switch (var_kind)
3954 case vect_simple_var:
3955 prefix = "vect";
3956 break;
3957 case vect_scalar_var:
3958 prefix = "stmp";
3959 break;
3960 case vect_mask_var:
3961 prefix = "mask";
3962 break;
3963 case vect_pointer_var:
3964 prefix = "vectp";
3965 break;
3966 default:
3967 gcc_unreachable ();
3970 if (name)
3972 char* tmp = concat (prefix, "_", name, NULL);
3973 new_vect_var = create_tmp_reg (type, tmp);
3974 free (tmp);
3976 else
3977 new_vect_var = create_tmp_reg (type, prefix);
3979 return new_vect_var;
3982 /* Like vect_get_new_vect_var but return an SSA name. */
3984 tree
3985 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3987 const char *prefix;
3988 tree new_vect_var;
3990 switch (var_kind)
3992 case vect_simple_var:
3993 prefix = "vect";
3994 break;
3995 case vect_scalar_var:
3996 prefix = "stmp";
3997 break;
3998 case vect_pointer_var:
3999 prefix = "vectp";
4000 break;
4001 default:
4002 gcc_unreachable ();
4005 if (name)
4007 char* tmp = concat (prefix, "_", name, NULL);
4008 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4009 free (tmp);
4011 else
4012 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4014 return new_vect_var;
4017 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. */
4019 static void
4020 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
4021 stmt_vec_info stmt_info)
4023 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4024 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
4025 int misalign = DR_MISALIGNMENT (dr);
4026 if (misalign == -1)
4027 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4028 else
4029 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
4032 /* Function vect_create_addr_base_for_vector_ref.
4034 Create an expression that computes the address of the first memory location
4035 that will be accessed for a data reference.
4037 Input:
4038 STMT: The statement containing the data reference.
4039 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4040 OFFSET: Optional. If supplied, it is added to the initial address.
4041 LOOP: Specify the loop-nest relative to which the address should be computed.
4042 For example, when the dataref is in an inner-loop nested in an
4043 outer-loop that is now being vectorized, LOOP can be either the
4044 outer-loop, or the inner-loop. The first memory location accessed
4045 by the following dataref ('in' points to short):
4047 for (i=0; i<N; i++)
4048 for (j=0; j<M; j++)
4049 s += in[i+j]
4051 is as follows:
4052 if LOOP=i_loop: &in (relative to i_loop)
4053 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4054 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4055 initial address. Unlike OFFSET, which is the number of elements to
4056 be added, BYTE_OFFSET is measured in bytes.
4058 Output:
4059 1. Return an SSA_NAME whose value is the address of the memory location of
4060 the first vector of the data reference.
4061 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4062 these statement(s) which define the returned SSA_NAME.
4064 FORNOW: We are only handling array accesses with step 1. */
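/* A minimal sketch, with made-up SSA names, of what this function might
 append to NEW_STMT_LIST for a simple loop dataref `a[i]' of type int,
 with no OFFSET or BYTE_OFFSET (the constant parts usually fold away):

 _1 = DR_OFFSET + DR_INIT; // in sizetype
 vectp_a.7 = &a + _1; // the returned SSA_NAME

 When OFFSET is given it is first scaled by the element size (here
 4 bytes) and added to _1; BYTE_OFFSET is added unscaled. */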
4066 tree
4067 vect_create_addr_base_for_vector_ref (gimple *stmt,
4068 gimple_seq *new_stmt_list,
4069 tree offset,
4070 struct loop *loop,
4071 tree byte_offset)
4073 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4074 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4075 tree data_ref_base;
4076 const char *base_name;
4077 tree addr_base;
4078 tree dest;
4079 gimple_seq seq = NULL;
4080 tree base_offset;
4081 tree init;
4082 tree vect_ptr_type;
4083 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4084 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4086 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
4088 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
4090 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
4092 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4093 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
4094 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
4096 else
4098 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
4099 base_offset = unshare_expr (DR_OFFSET (dr));
4100 init = unshare_expr (DR_INIT (dr));
4103 if (loop_vinfo)
4104 base_name = get_name (data_ref_base);
4105 else
4107 base_offset = ssize_int (0);
4108 init = ssize_int (0);
4109 base_name = get_name (DR_REF (dr));
4112 /* Create base_offset */
4113 base_offset = size_binop (PLUS_EXPR,
4114 fold_convert (sizetype, base_offset),
4115 fold_convert (sizetype, init));
4117 if (offset)
4119 offset = fold_build2 (MULT_EXPR, sizetype,
4120 fold_convert (sizetype, offset), step);
4121 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4122 base_offset, offset);
4124 if (byte_offset)
4126 byte_offset = fold_convert (sizetype, byte_offset);
4127 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4128 base_offset, byte_offset);
4131 /* base + base_offset */
4132 if (loop_vinfo)
4133 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4134 else
4136 addr_base = build1 (ADDR_EXPR,
4137 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4138 unshare_expr (DR_REF (dr)));
4141 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4142 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4143 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4144 gimple_seq_add_seq (new_stmt_list, seq);
4146 if (DR_PTR_INFO (dr)
4147 && TREE_CODE (addr_base) == SSA_NAME
4148 && !SSA_NAME_PTR_INFO (addr_base))
4150 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4151 if (offset || byte_offset)
4152 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4155 if (dump_enabled_p ())
4157 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4158 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4159 dump_printf (MSG_NOTE, "\n");
4162 return addr_base;
4166 /* Function vect_create_data_ref_ptr.
4168 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4169 location accessed in the loop by STMT, along with the def-use update
4170 chain to appropriately advance the pointer through the loop iterations.
4171 Also set aliasing information for the pointer. This pointer is used by
4172 the callers to this function to create a memory reference expression for
4173 vector load/store access.
4175 Input:
4176 1. STMT: a stmt that references memory. Expected to be of the form
4177 GIMPLE_ASSIGN <name, data-ref> or
4178 GIMPLE_ASSIGN <data-ref, name>.
4179 2. AGGR_TYPE: the type of the reference, which should be either a vector
4180 or an array.
4181 3. AT_LOOP: the loop where the vector memref is to be created.
4182 4. OFFSET (optional): an offset to be added to the initial address accessed
4183 by the data-ref in STMT.
4184 5. BSI: location where the new stmts are to be placed if there is no loop
4185 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4186 pointing to the initial address.
4187 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4188 to the initial address accessed by the data-ref in STMT. This is
4189 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4190 in bytes.
4192 Output:
4193 1. Declare a new ptr to vector_type, and have it point to the base of the
4194 data reference (the initial address accessed by the data reference).
4195 For example, for a vector of type V8HI, the following code is generated:
4197 v8hi *ap;
4198 ap = (v8hi *)initial_address;
4200 if OFFSET is not supplied:
4201 initial_address = &a[init];
4202 if OFFSET is supplied:
4203 initial_address = &a[init + OFFSET];
4204 if BYTE_OFFSET is supplied:
4205 initial_address = &a[init] + BYTE_OFFSET;
4207 Return the initial_address in INITIAL_ADDRESS.
4209 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4210 update the pointer in each iteration of the loop.
4212 Return the increment stmt that updates the pointer in PTR_INCR.
4214 3. Set INV_P to true if the access pattern of the data reference in the
4215 vectorized loop is invariant. Set it to false otherwise.
4217 4. Return the pointer. */
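/* A hedged sketch of a typical caller (this is not the exact code used by
 the load/store vectorization routines, only an illustration of the
 interface; stmt, vectype, loop, gsi and dr stand for the caller's
 context):

 tree init_addr;
 bool inv_p;
 gimple *ptr_incr;
 tree dataref_ptr
 = vect_create_data_ref_ptr (stmt, vectype, loop, NULL_TREE,
 &init_addr, gsi, &ptr_incr,
 false, &inv_p);
 tree data_ref
 = build2 (MEM_REF, vectype, dataref_ptr,
 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));

 i.e. the returned pointer is dereferenced through a MEM_REF, and
 ptr_incr can later be handed to bump_vector_ptr to advance it. */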
4219 tree
4220 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4221 tree offset, tree *initial_address,
4222 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4223 bool only_init, bool *inv_p, tree byte_offset)
4225 const char *base_name;
4226 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4227 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4228 struct loop *loop = NULL;
4229 bool nested_in_vect_loop = false;
4230 struct loop *containing_loop = NULL;
4231 tree aggr_ptr_type;
4232 tree aggr_ptr;
4233 tree new_temp;
4234 gimple_seq new_stmt_list = NULL;
4235 edge pe = NULL;
4236 basic_block new_bb;
4237 tree aggr_ptr_init;
4238 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4239 tree aptr;
4240 gimple_stmt_iterator incr_gsi;
4241 bool insert_after;
4242 tree indx_before_incr, indx_after_incr;
4243 gimple *incr;
4244 tree step;
4245 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4247 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4248 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4250 if (loop_vinfo)
4252 loop = LOOP_VINFO_LOOP (loop_vinfo);
4253 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4254 containing_loop = (gimple_bb (stmt))->loop_father;
4255 pe = loop_preheader_edge (loop);
4257 else
4259 gcc_assert (bb_vinfo);
4260 only_init = true;
4261 *ptr_incr = NULL;
4264 /* Check the step (evolution) of the load in LOOP, and record
4265 whether it's invariant. */
4266 if (nested_in_vect_loop)
4267 step = STMT_VINFO_DR_STEP (stmt_info);
4268 else
4269 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4271 if (integer_zerop (step))
4272 *inv_p = true;
4273 else
4274 *inv_p = false;
4276 /* Create an expression for the first address accessed by this load
4277 in LOOP. */
4278 base_name = get_name (DR_BASE_ADDRESS (dr));
4280 if (dump_enabled_p ())
4282 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4283 dump_printf_loc (MSG_NOTE, vect_location,
4284 "create %s-pointer variable to type: ",
4285 get_tree_code_name (TREE_CODE (aggr_type)));
4286 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4287 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4288 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4289 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4290 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4291 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4292 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4293 else
4294 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4295 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4296 dump_printf (MSG_NOTE, "\n");
4299 /* (1) Create the new aggregate-pointer variable.
4300 Vector and array types inherit the alias set of their component
4301 type by default so we need to use a ref-all pointer if the data
4302 reference does not conflict with the created aggregated data
4303 reference because it is not addressable. */
4304 bool need_ref_all = false;
4305 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4306 get_alias_set (DR_REF (dr))))
4307 need_ref_all = true;
4308 /* Likewise for any of the data references in the stmt group. */
4309 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4311 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4314 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4315 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4316 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4317 get_alias_set (DR_REF (sdr))))
4319 need_ref_all = true;
4320 break;
4322 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4324 while (orig_stmt);
4326 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4327 need_ref_all);
4328 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4331 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4332 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4333 def-use update cycles for the pointer: one relative to the outer-loop
4334 (LOOP), which is what steps (3) and (4) below do. The other is relative
4335 to the inner-loop (which is the inner-most loop containing the dataref),
4336 and this is done by step (5) below.
4338 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4339 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4340 redundant. Steps (3),(4) create the following:
4342 vp0 = &base_addr;
4343 LOOP: vp1 = phi(vp0,vp2)
4346 vp2 = vp1 + step
4347 goto LOOP
4349 If there is an inner-loop nested in loop, then step (5) will also be
4350 applied, and an additional update in the inner-loop will be created:
4352 vp0 = &base_addr;
4353 LOOP: vp1 = phi(vp0,vp2)
4355 inner: vp3 = phi(vp1,vp4)
4356 vp4 = vp3 + inner_step
4357 if () goto inner
4359 vp2 = vp1 + step
4360 if () goto LOOP */
4362 /* (2) Calculate the initial address of the aggregate-pointer, and set
4363 the aggregate-pointer to point to it before the loop. */
4365 /* Create &base[init_val + offset] + byte_offset in the loop preheader. */
4367 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4368 offset, loop, byte_offset);
4369 if (new_stmt_list)
4371 if (pe)
4373 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4374 gcc_assert (!new_bb);
4376 else
4377 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4380 *initial_address = new_temp;
4381 aggr_ptr_init = new_temp;
4383 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4384 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4385 inner-loop nested in LOOP (during outer-loop vectorization). */
4387 /* No update in loop is required. */
4388 if (only_init && (!loop_vinfo || at_loop == loop))
4389 aptr = aggr_ptr_init;
4390 else
4392 /* The step of the aggregate pointer is the type size. */
4393 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4394 /* One exception to the above is when the scalar step of the load in
4395 LOOP is zero. In this case the step here is also zero. */
4396 if (*inv_p)
4397 iv_step = size_zero_node;
4398 else if (tree_int_cst_sgn (step) == -1)
4399 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4401 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4403 create_iv (aggr_ptr_init,
4404 fold_convert (aggr_ptr_type, iv_step),
4405 aggr_ptr, loop, &incr_gsi, insert_after,
4406 &indx_before_incr, &indx_after_incr);
4407 incr = gsi_stmt (incr_gsi);
4408 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4410 /* Copy the points-to information if it exists. */
4411 if (DR_PTR_INFO (dr))
4413 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4414 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4416 if (ptr_incr)
4417 *ptr_incr = incr;
4419 aptr = indx_before_incr;
4422 if (!nested_in_vect_loop || only_init)
4423 return aptr;
4426 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4427 nested in LOOP, if it exists. */
4429 gcc_assert (nested_in_vect_loop);
4430 if (!only_init)
4432 standard_iv_increment_position (containing_loop, &incr_gsi,
4433 &insert_after);
4434 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4435 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4436 &indx_after_incr);
4437 incr = gsi_stmt (incr_gsi);
4438 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4440 /* Copy the points-to information if it exists. */
4441 if (DR_PTR_INFO (dr))
4443 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4444 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4446 if (ptr_incr)
4447 *ptr_incr = incr;
4449 return indx_before_incr;
4451 else
4452 gcc_unreachable ();
4456 /* Function bump_vector_ptr
4458 Increment a pointer (to a vector type) by vector-size. If requested,
4459 i.e. if PTR_INCR is given, then also connect the new increment stmt
4460 to the existing def-use update-chain of the pointer, by modifying
4461 the PTR_INCR as illustrated below:
4463 The pointer def-use update-chain before this function:
4464 DATAREF_PTR = phi (p_0, p_2)
4465 ....
4466 PTR_INCR: p_2 = DATAREF_PTR + step
4468 The pointer def-use update-chain after this function:
4469 DATAREF_PTR = phi (p_0, p_2)
4470 ....
4471 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4472 ....
4473 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4475 Input:
4476 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4477 in the loop.
4478 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4479 the loop. The increment amount across iterations is expected
4480 to be vector_size.
4481 BSI - location where the new update stmt is to be placed.
4482 STMT - the original scalar memory-access stmt that is being vectorized.
4483 BUMP - optional. The offset by which to bump the pointer. If not given,
4484 the offset is assumed to be vector_size.
4486 Output: Return NEW_DATAREF_PTR as illustrated above.
4490 tree
4491 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4492 gimple *stmt, tree bump)
4494 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4495 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4496 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4497 tree update = TYPE_SIZE_UNIT (vectype);
4498 gassign *incr_stmt;
4499 ssa_op_iter iter;
4500 use_operand_p use_p;
4501 tree new_dataref_ptr;
4503 if (bump)
4504 update = bump;
4506 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4507 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4508 else
4509 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4510 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4511 dataref_ptr, update);
4512 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4514 /* Copy the points-to information if it exists. */
4515 if (DR_PTR_INFO (dr))
4517 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4518 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4521 if (!ptr_incr)
4522 return new_dataref_ptr;
4524 /* Update the vector-pointer's cross-iteration increment. */
4525 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4527 tree use = USE_FROM_PTR (use_p);
4529 if (use == dataref_ptr)
4530 SET_USE (use_p, new_dataref_ptr);
4531 else
4532 gcc_assert (tree_int_cst_compare (use, update) == 0);
4535 return new_dataref_ptr;
4539 /* Function vect_create_destination_var.
4541 Create a new temporary of type VECTYPE. */
4543 tree
4544 vect_create_destination_var (tree scalar_dest, tree vectype)
4546 tree vec_dest;
4547 const char *name;
4548 char *new_name;
4549 tree type;
4550 enum vect_var_kind kind;
4552 kind = vectype
4553 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4554 ? vect_mask_var
4555 : vect_simple_var
4556 : vect_scalar_var;
4557 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4559 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4561 name = get_name (scalar_dest);
4562 if (name)
4563 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4564 else
4565 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4566 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4567 free (new_name);
4569 return vec_dest;
4572 /* Function vect_grouped_store_supported.
4574 Returns TRUE if interleave high and interleave low permutations
4575 are supported, and FALSE otherwise. */
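/* For example (derived from the power-of-two branch below), for V4SI
 (four elements per vector) the two masks whose support is checked are

 interleave high: { 0, 4, 1, 5 }
 interleave low: { 2, 6, 3, 7 }

 i.e. a full interleave of two input vectors split across two
 VEC_PERM_EXPRs. */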
4577 bool
4578 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4580 machine_mode mode = TYPE_MODE (vectype);
4582 /* vect_permute_store_chain requires the group size to be equal to 3 or
4583 be a power of two. */
4584 if (count != 3 && exact_log2 (count) == -1)
4586 if (dump_enabled_p ())
4587 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4588 "the size of the group of accesses"
4589 " is not a power of 2 or not eqaul to 3\n");
4590 return false;
4593 /* Check that the permutation is supported. */
4594 if (VECTOR_MODE_P (mode))
4596 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4597 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4599 if (count == 3)
4601 unsigned int j0 = 0, j1 = 0, j2 = 0;
4602 unsigned int i, j;
4604 for (j = 0; j < 3; j++)
4606 int nelt0 = ((3 - j) * nelt) % 3;
4607 int nelt1 = ((3 - j) * nelt + 1) % 3;
4608 int nelt2 = ((3 - j) * nelt + 2) % 3;
4609 for (i = 0; i < nelt; i++)
4611 if (3 * i + nelt0 < nelt)
4612 sel[3 * i + nelt0] = j0++;
4613 if (3 * i + nelt1 < nelt)
4614 sel[3 * i + nelt1] = nelt + j1++;
4615 if (3 * i + nelt2 < nelt)
4616 sel[3 * i + nelt2] = 0;
4618 if (!can_vec_perm_p (mode, false, sel))
4620 if (dump_enabled_p ())
4621 dump_printf (MSG_MISSED_OPTIMIZATION,
4622 "permutaion op not supported by target.\n");
4623 return false;
4626 for (i = 0; i < nelt; i++)
4628 if (3 * i + nelt0 < nelt)
4629 sel[3 * i + nelt0] = 3 * i + nelt0;
4630 if (3 * i + nelt1 < nelt)
4631 sel[3 * i + nelt1] = 3 * i + nelt1;
4632 if (3 * i + nelt2 < nelt)
4633 sel[3 * i + nelt2] = nelt + j2++;
4635 if (!can_vec_perm_p (mode, false, sel))
4637 if (dump_enabled_p ())
4638 dump_printf (MSG_MISSED_OPTIMIZATION,
4639 "permutaion op not supported by target.\n");
4640 return false;
4643 return true;
4645 else
4647 /* If length is not equal to 3 then only power of 2 is supported. */
4648 gcc_assert (exact_log2 (count) != -1);
4650 for (i = 0; i < nelt / 2; i++)
4652 sel[i * 2] = i;
4653 sel[i * 2 + 1] = i + nelt;
4655 if (can_vec_perm_p (mode, false, sel))
4657 for (i = 0; i < nelt; i++)
4658 sel[i] += nelt / 2;
4659 if (can_vec_perm_p (mode, false, sel))
4660 return true;
4665 if (dump_enabled_p ())
4666 dump_printf (MSG_MISSED_OPTIMIZATION,
4667 "permutaion op not supported by target.\n");
4668 return false;
4672 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4673 type VECTYPE. */
4675 bool
4676 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4678 return vect_lanes_optab_supported_p ("vec_store_lanes",
4679 vec_store_lanes_optab,
4680 vectype, count);
4684 /* Function vect_permute_store_chain.
4686 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4687 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4688 the data correctly for the stores. Return the final references for stores
4689 in RESULT_CHAIN.
4691 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4692 The input is 4 vectors each containing 8 elements. We assign a number to
4693 each element, the input sequence is:
4695 1st vec: 0 1 2 3 4 5 6 7
4696 2nd vec: 8 9 10 11 12 13 14 15
4697 3rd vec: 16 17 18 19 20 21 22 23
4698 4th vec: 24 25 26 27 28 29 30 31
4700 The output sequence should be:
4702 1st vec: 0 8 16 24 1 9 17 25
4703 2nd vec: 2 10 18 26 3 11 19 27
4704 3rd vec: 4 12 20 28 5 13 21 29
4705 4th vec: 6 14 22 30 7 15 23 31
4707 i.e., we interleave the contents of the four vectors in their order.
4709 We use interleave_high/low instructions to create such output. The input of
4710 each interleave_high/low operation is two vectors:
4711 1st vec 2nd vec
4712 0 1 2 3 4 5 6 7
4713 the even elements of the result vector are obtained left-to-right from the
4714 high/low elements of the first vector. The odd elements of the result are
4715 obtained left-to-right from the high/low elements of the second vector.
4716 The output of interleave_high will be: 0 4 1 5
4717 and of interleave_low: 2 6 3 7
4720 The permutation is done in log LENGTH stages. In each stage interleave_high
4721 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4722 where the first argument is taken from the first half of DR_CHAIN and the
4723 second argument from its second half.
4724 In our example,
4726 I1: interleave_high (1st vec, 3rd vec)
4727 I2: interleave_low (1st vec, 3rd vec)
4728 I3: interleave_high (2nd vec, 4th vec)
4729 I4: interleave_low (2nd vec, 4th vec)
4731 The output for the first stage is:
4733 I1: 0 16 1 17 2 18 3 19
4734 I2: 4 20 5 21 6 22 7 23
4735 I3: 8 24 9 25 10 26 11 27
4736 I4: 12 28 13 29 14 30 15 31
4738 The output of the second stage, i.e. the final result is:
4740 I1: 0 8 16 24 1 9 17 25
4741 I2: 2 10 18 26 3 11 19 27
4742 I3: 4 12 20 28 5 13 21 29
4743 I4: 6 14 22 30 7 15 23 31. */
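/* In terms of the VEC_PERM_EXPR masks built in the power-of-two branch
 below, the example above (nelt == 8) uses in every stage

 perm_mask_high = { 0, 8, 1, 9, 2, 10, 3, 11 }
 perm_mask_low = { 4, 12, 5, 13, 6, 14, 7, 15 }

 applied to the pairs (DR_CHAIN[j], DR_CHAIN[j + LENGTH/2]). */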
4745 void
4746 vect_permute_store_chain (vec<tree> dr_chain,
4747 unsigned int length,
4748 gimple *stmt,
4749 gimple_stmt_iterator *gsi,
4750 vec<tree> *result_chain)
4752 tree vect1, vect2, high, low;
4753 gimple *perm_stmt;
4754 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4755 tree perm_mask_low, perm_mask_high;
4756 tree data_ref;
4757 tree perm3_mask_low, perm3_mask_high;
4758 unsigned int i, n, log_length = exact_log2 (length);
4759 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4760 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4762 result_chain->quick_grow (length);
4763 memcpy (result_chain->address (), dr_chain.address (),
4764 length * sizeof (tree));
4766 if (length == 3)
4768 unsigned int j0 = 0, j1 = 0, j2 = 0;
4770 for (j = 0; j < 3; j++)
4772 int nelt0 = ((3 - j) * nelt) % 3;
4773 int nelt1 = ((3 - j) * nelt + 1) % 3;
4774 int nelt2 = ((3 - j) * nelt + 2) % 3;
4776 for (i = 0; i < nelt; i++)
4778 if (3 * i + nelt0 < nelt)
4779 sel[3 * i + nelt0] = j0++;
4780 if (3 * i + nelt1 < nelt)
4781 sel[3 * i + nelt1] = nelt + j1++;
4782 if (3 * i + nelt2 < nelt)
4783 sel[3 * i + nelt2] = 0;
4785 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4787 for (i = 0; i < nelt; i++)
4789 if (3 * i + nelt0 < nelt)
4790 sel[3 * i + nelt0] = 3 * i + nelt0;
4791 if (3 * i + nelt1 < nelt)
4792 sel[3 * i + nelt1] = 3 * i + nelt1;
4793 if (3 * i + nelt2 < nelt)
4794 sel[3 * i + nelt2] = nelt + j2++;
4796 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4798 vect1 = dr_chain[0];
4799 vect2 = dr_chain[1];
4801 /* Create interleaving stmt:
4802 low = VEC_PERM_EXPR <vect1, vect2,
4803 {j, nelt, *, j + 1, nelt + j + 1, *,
4804 j + 2, nelt + j + 2, *, ...}> */
4805 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4806 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4807 vect2, perm3_mask_low);
4808 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4810 vect1 = data_ref;
4811 vect2 = dr_chain[2];
4812 /* Create interleaving stmt:
4813 low = VEC_PERM_EXPR <vect1, vect2,
4814 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4815 6, 7, nelt + j + 2, ...}> */
4816 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4817 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4818 vect2, perm3_mask_high);
4819 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4820 (*result_chain)[j] = data_ref;
4823 else
4825 /* If length is not equal to 3 then only power of 2 is supported. */
4826 gcc_assert (exact_log2 (length) != -1);
4828 for (i = 0, n = nelt / 2; i < n; i++)
4830 sel[i * 2] = i;
4831 sel[i * 2 + 1] = i + nelt;
4833 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4835 for (i = 0; i < nelt; i++)
4836 sel[i] += nelt / 2;
4837 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4839 for (i = 0, n = log_length; i < n; i++)
4841 for (j = 0; j < length/2; j++)
4843 vect1 = dr_chain[j];
4844 vect2 = dr_chain[j+length/2];
4846 /* Create interleaving stmt:
4847 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4848 ...}> */
4849 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4850 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4851 vect2, perm_mask_high);
4852 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4853 (*result_chain)[2*j] = high;
4855 /* Create interleaving stmt:
4856 low = VEC_PERM_EXPR <vect1, vect2,
4857 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4858 ...}> */
4859 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4860 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4861 vect2, perm_mask_low);
4862 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4863 (*result_chain)[2*j+1] = low;
4865 memcpy (dr_chain.address (), result_chain->address (),
4866 length * sizeof (tree));
4871 /* Function vect_setup_realignment
4873 This function is called when vectorizing an unaligned load using
4874 the dr_explicit_realign[_optimized] scheme.
4875 This function generates the following code at the loop prolog:
4877 p = initial_addr;
4878 x msq_init = *(floor(p)); # prolog load
4879 realignment_token = call target_builtin;
4880 loop:
4881 x msq = phi (msq_init, ---)
4883 The stmts marked with x are generated only for the case of
4884 dr_explicit_realign_optimized.
4886 The code above sets up a new (vector) pointer, pointing to the first
4887 location accessed by STMT, and a "floor-aligned" load using that pointer.
4888 It also generates code to compute the "realignment-token" (if the relevant
4889 target hook was defined), and creates a phi-node at the loop-header bb
4890 whose arguments are the result of the prolog-load (created by this
4891 function) and the result of a load that takes place in the loop (to be
4892 created by the caller to this function).
4894 For the case of dr_explicit_realign_optimized:
4895 The caller to this function uses the phi-result (msq) to create the
4896 realignment code inside the loop, and sets up the missing phi argument,
4897 as follows:
4898 loop:
4899 msq = phi (msq_init, lsq)
4900 lsq = *(floor(p')); # load in loop
4901 result = realign_load (msq, lsq, realignment_token);
4903 For the case of dr_explicit_realign:
4904 loop:
4905 msq = *(floor(p)); # load in loop
4906 p' = p + (VS-1);
4907 lsq = *(floor(p')); # load in loop
4908 result = realign_load (msq, lsq, realignment_token);
4910 Input:
4911 STMT - (scalar) load stmt to be vectorized. This load accesses
4912 a memory location that may be unaligned.
4913 BSI - place where new code is to be inserted.
4914 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4915 is used.
4917 Output:
4918 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4919 target hook, if defined.
4920 Return value - the result of the loop-header phi node. */
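/* A rough sketch, with illustrative names only, of the code generated for
 dr_explicit_realign_optimized on a V16QI load (MASK_FOR_LOAD stands for
 whatever targetm.vectorize.builtin_mask_for_load returns, if anything):

 # loop preheader:
 vectp.5 = <initial address>;
 _1 = vectp.5 & -16; // floor-align the pointer
 msq_init = MEM[(v16qi *)_1]; // prolog load
 realign_tok = MASK_FOR_LOAD (init_addr);

 # loop header:
 msq = PHI <msq_init (preheader), lsq (latch)>

 The in-loop load of lsq and the REALIGN_LOAD itself are created by the
 caller, as described above. */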
4922 tree
4923 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4924 tree *realignment_token,
4925 enum dr_alignment_support alignment_support_scheme,
4926 tree init_addr,
4927 struct loop **at_loop)
4929 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4930 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4931 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4932 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4933 struct loop *loop = NULL;
4934 edge pe = NULL;
4935 tree scalar_dest = gimple_assign_lhs (stmt);
4936 tree vec_dest;
4937 gimple *inc;
4938 tree ptr;
4939 tree data_ref;
4940 basic_block new_bb;
4941 tree msq_init = NULL_TREE;
4942 tree new_temp;
4943 gphi *phi_stmt;
4944 tree msq = NULL_TREE;
4945 gimple_seq stmts = NULL;
4946 bool inv_p;
4947 bool compute_in_loop = false;
4948 bool nested_in_vect_loop = false;
4949 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4950 struct loop *loop_for_initial_load = NULL;
4952 if (loop_vinfo)
4954 loop = LOOP_VINFO_LOOP (loop_vinfo);
4955 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4958 gcc_assert (alignment_support_scheme == dr_explicit_realign
4959 || alignment_support_scheme == dr_explicit_realign_optimized);
4961 /* We need to generate three things:
4962 1. the misalignment computation
4963 2. the extra vector load (for the optimized realignment scheme).
4964 3. the phi node for the two vectors from which the realignment is
4965 done (for the optimized realignment scheme). */
4967 /* 1. Determine where to generate the misalignment computation.
4969 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4970 calculation will be generated by this function, outside the loop (in the
4971 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4972 caller, inside the loop.
4974 Background: If the misalignment remains fixed throughout the iterations of
4975 the loop, then both realignment schemes are applicable, and also the
4976 misalignment computation can be done outside LOOP. This is because we are
4977 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4978 are a multiple of VS (the Vector Size), and therefore the misalignment in
4979 different vectorized LOOP iterations is always the same.
4980 The problem arises only if the memory access is in an inner-loop nested
4981 inside LOOP, which is now being vectorized using outer-loop vectorization.
4982 This is the only case when the misalignment of the memory access may not
4983 remain fixed throughout the iterations of the inner-loop (as explained in
4984 detail in vect_supportable_dr_alignment). In this case, not only is the
4985 optimized realignment scheme not applicable, but also the misalignment
4986 computation (and generation of the realignment token that is passed to
4987 REALIGN_LOAD) have to be done inside the loop.
4989 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4990 or not, which in turn determines if the misalignment is computed inside
4991 the inner-loop, or outside LOOP. */
4993 if (init_addr != NULL_TREE || !loop_vinfo)
4995 compute_in_loop = true;
4996 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5000 /* 2. Determine where to generate the extra vector load.
5002 For the optimized realignment scheme, instead of generating two vector
5003 loads in each iteration, we generate a single extra vector load in the
5004 preheader of the loop, and in each iteration reuse the result of the
5005 vector load from the previous iteration. In case the memory access is in
5006 an inner-loop nested inside LOOP, which is now being vectorized using
5007 outer-loop vectorization, we need to determine whether this initial vector
5008 load should be generated at the preheader of the inner-loop, or can be
5009 generated at the preheader of LOOP. If the memory access has no evolution
5010 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5011 to be generated inside LOOP (in the preheader of the inner-loop). */
5013 if (nested_in_vect_loop)
5015 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5016 bool invariant_in_outerloop =
5017 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5018 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5020 else
5021 loop_for_initial_load = loop;
5022 if (at_loop)
5023 *at_loop = loop_for_initial_load;
5025 if (loop_for_initial_load)
5026 pe = loop_preheader_edge (loop_for_initial_load);
5028 /* 3. For the case of the optimized realignment, create the first vector
5029 load at the loop preheader. */
5031 if (alignment_support_scheme == dr_explicit_realign_optimized)
5033 /* Create msq_init = *(floor(p1)) in the loop preheader */
5034 gassign *new_stmt;
5036 gcc_assert (!compute_in_loop);
5037 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5038 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5039 NULL_TREE, &init_addr, NULL, &inc,
5040 true, &inv_p);
5041 if (TREE_CODE (ptr) == SSA_NAME)
5042 new_temp = copy_ssa_name (ptr);
5043 else
5044 new_temp = make_ssa_name (TREE_TYPE (ptr));
5045 new_stmt = gimple_build_assign
5046 (new_temp, BIT_AND_EXPR, ptr,
5047 build_int_cst (TREE_TYPE (ptr),
5048 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
5049 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5050 gcc_assert (!new_bb);
5051 data_ref
5052 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5053 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5054 new_stmt = gimple_build_assign (vec_dest, data_ref);
5055 new_temp = make_ssa_name (vec_dest, new_stmt);
5056 gimple_assign_set_lhs (new_stmt, new_temp);
5057 if (pe)
5059 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5060 gcc_assert (!new_bb);
5062 else
5063 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5065 msq_init = gimple_assign_lhs (new_stmt);
5068 /* 4. Create realignment token using a target builtin, if available.
5069 It is done either inside the containing loop, or before LOOP (as
5070 determined above). */
5072 if (targetm.vectorize.builtin_mask_for_load)
5074 gcall *new_stmt;
5075 tree builtin_decl;
5077 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5078 if (!init_addr)
5080 /* Generate the INIT_ADDR computation outside LOOP. */
5081 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5082 NULL_TREE, loop);
5083 if (loop)
5085 pe = loop_preheader_edge (loop);
5086 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5087 gcc_assert (!new_bb);
5089 else
5090 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5093 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5094 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5095 vec_dest =
5096 vect_create_destination_var (scalar_dest,
5097 gimple_call_return_type (new_stmt));
5098 new_temp = make_ssa_name (vec_dest, new_stmt);
5099 gimple_call_set_lhs (new_stmt, new_temp);
5101 if (compute_in_loop)
5102 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5103 else
5105 /* Generate the misalignment computation outside LOOP. */
5106 pe = loop_preheader_edge (loop);
5107 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5108 gcc_assert (!new_bb);
5111 *realignment_token = gimple_call_lhs (new_stmt);
5113 /* The result of the CALL_EXPR to this builtin is determined from
5114 the value of the parameter and no global variables are touched
5115 which makes the builtin a "const" function. Requiring the
5116 builtin to have the "const" attribute makes it unnecessary
5117 to call mark_call_clobbered. */
5118 gcc_assert (TREE_READONLY (builtin_decl));
5121 if (alignment_support_scheme == dr_explicit_realign)
5122 return msq;
5124 gcc_assert (!compute_in_loop);
5125 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5128 /* 5. Create msq = phi <msq_init, lsq> in loop */
5130 pe = loop_preheader_edge (containing_loop);
5131 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5132 msq = make_ssa_name (vec_dest);
5133 phi_stmt = create_phi_node (msq, containing_loop->header);
5134 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5136 return msq;
5140 /* Function vect_grouped_load_supported.
5142 Returns TRUE if even and odd permutations are supported,
5143 and FALSE otherwise. */
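/* For example (derived from the power-of-two branch below), for V4SI the
 two masks whose support is checked are

 extract even: { 0, 2, 4, 6 }
 extract odd: { 1, 3, 5, 7 }

 taken from the concatenation of two input vectors. */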
5145 bool
5146 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
5148 machine_mode mode = TYPE_MODE (vectype);
5150 /* vect_permute_load_chain requires the group size to be equal to 3 or
5151 be a power of two. */
5152 if (count != 3 && exact_log2 (count) == -1)
5154 if (dump_enabled_p ())
5155 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5156 "the size of the group of accesses"
5157 " is not a power of 2 or not equal to 3\n");
5158 return false;
5161 /* Check that the permutation is supported. */
5162 if (VECTOR_MODE_P (mode))
5164 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5165 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5167 if (count == 3)
5169 unsigned int k;
5170 for (k = 0; k < 3; k++)
5172 for (i = 0; i < nelt; i++)
5173 if (3 * i + k < 2 * nelt)
5174 sel[i] = 3 * i + k;
5175 else
5176 sel[i] = 0;
5177 if (!can_vec_perm_p (mode, false, sel))
5179 if (dump_enabled_p ())
5180 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5181 "shuffle of 3 loads is not supported by"
5182 " target\n");
5183 return false;
5185 for (i = 0, j = 0; i < nelt; i++)
5186 if (3 * i + k < 2 * nelt)
5187 sel[i] = i;
5188 else
5189 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5190 if (!can_vec_perm_p (mode, false, sel))
5192 if (dump_enabled_p ())
5193 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5194 "shuffle of 3 loads is not supported by"
5195 " target\n");
5196 return false;
5199 return true;
5201 else
5203 /* If length is not equal to 3 then only power of 2 is supported. */
5204 gcc_assert (exact_log2 (count) != -1);
5205 for (i = 0; i < nelt; i++)
5206 sel[i] = i * 2;
5207 if (can_vec_perm_p (mode, false, sel))
5209 for (i = 0; i < nelt; i++)
5210 sel[i] = i * 2 + 1;
5211 if (can_vec_perm_p (mode, false, sel))
5212 return true;
5217 if (dump_enabled_p ())
5218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5219 "extract even/odd not supported by target\n");
5220 return false;
5223 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5224 type VECTYPE. */
5226 bool
5227 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5229 return vect_lanes_optab_supported_p ("vec_load_lanes",
5230 vec_load_lanes_optab,
5231 vectype, count);
5234 /* Function vect_permute_load_chain.
5236 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5237 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5238 the input data correctly. Return the final references for loads in
5239 RESULT_CHAIN.
5241 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5242 The input is 4 vectors each containing 8 elements. We assign a number to each
5243 element, the input sequence is:
5245 1st vec: 0 1 2 3 4 5 6 7
5246 2nd vec: 8 9 10 11 12 13 14 15
5247 3rd vec: 16 17 18 19 20 21 22 23
5248 4th vec: 24 25 26 27 28 29 30 31
5250 The output sequence should be:
5252 1st vec: 0 4 8 12 16 20 24 28
5253 2nd vec: 1 5 9 13 17 21 25 29
5254 3rd vec: 2 6 10 14 18 22 26 30
5255 4th vec: 3 7 11 15 19 23 27 31
5257 i.e., the first output vector should contain the first elements of each
5258 interleaving group, etc.
5260 We use extract_even/odd instructions to create such output. The input of
5261 each extract_even/odd operation is two vectors
5262 1st vec 2nd vec
5263 0 1 2 3 4 5 6 7
5265 and the output is the vector of extracted even/odd elements. The output of
5266 extract_even will be: 0 2 4 6
5267 and of extract_odd: 1 3 5 7
5270 The permutation is done in log LENGTH stages. In each stage extract_even
5271 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5272 their order. In our example,
5274 E1: extract_even (1st vec, 2nd vec)
5275 E2: extract_odd (1st vec, 2nd vec)
5276 E3: extract_even (3rd vec, 4th vec)
5277 E4: extract_odd (3rd vec, 4th vec)
5279 The output for the first stage will be:
5281 E1: 0 2 4 6 8 10 12 14
5282 E2: 1 3 5 7 9 11 13 15
5283 E3: 16 18 20 22 24 26 28 30
5284 E4: 17 19 21 23 25 27 29 31
5286 In order to proceed and create the correct sequence for the next stage (or
5287 for the correct output, if the second stage is the last one, as in our
5288 example), we first put the output of extract_even operation and then the
5289 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5290 The input for the second stage is:
5292 1st vec (E1): 0 2 4 6 8 10 12 14
5293 2nd vec (E3): 16 18 20 22 24 26 28 30
5294 3rd vec (E2): 1 3 5 7 9 11 13 15
5295 4th vec (E4): 17 19 21 23 25 27 29 31
5297 The output of the second stage:
5299 E1: 0 4 8 12 16 20 24 28
5300 E2: 2 6 10 14 18 22 26 30
5301 E3: 1 5 9 13 17 21 25 29
5302 E4: 3 7 11 15 19 23 27 31
5304 And RESULT_CHAIN after reordering:
5306 1st vec (E1): 0 4 8 12 16 20 24 28
5307 2nd vec (E3): 1 5 9 13 17 21 25 29
5308 3rd vec (E2): 2 6 10 14 18 22 26 30
5309 4th vec (E4): 3 7 11 15 19 23 27 31. */
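/* For the example above (nelt == 8) the two VEC_PERM_EXPR masks used in
 every stage of the power-of-two branch below are

 perm_mask_even = { 0, 2, 4, 6, 8, 10, 12, 14 }
 perm_mask_odd = { 1, 3, 5, 7, 9, 11, 13, 15 }

 and each stage stores the even results in the first half of
 RESULT_CHAIN and the odd results in the second half, which produces the
 reordering shown above. */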
5311 static void
5312 vect_permute_load_chain (vec<tree> dr_chain,
5313 unsigned int length,
5314 gimple *stmt,
5315 gimple_stmt_iterator *gsi,
5316 vec<tree> *result_chain)
5318 tree data_ref, first_vect, second_vect;
5319 tree perm_mask_even, perm_mask_odd;
5320 tree perm3_mask_low, perm3_mask_high;
5321 gimple *perm_stmt;
5322 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5323 unsigned int i, j, log_length = exact_log2 (length);
5324 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5325 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5327 result_chain->quick_grow (length);
5328 memcpy (result_chain->address (), dr_chain.address (),
5329 length * sizeof (tree));
5331 if (length == 3)
5333 unsigned int k;
5335 for (k = 0; k < 3; k++)
5337 for (i = 0; i < nelt; i++)
5338 if (3 * i + k < 2 * nelt)
5339 sel[i] = 3 * i + k;
5340 else
5341 sel[i] = 0;
5342 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5344 for (i = 0, j = 0; i < nelt; i++)
5345 if (3 * i + k < 2 * nelt)
5346 sel[i] = i;
5347 else
5348 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5350 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5352 first_vect = dr_chain[0];
5353 second_vect = dr_chain[1];
5355 /* Create interleaving stmt (low part of):
5356 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5357 ...}> */
5358 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5359 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5360 second_vect, perm3_mask_low);
5361 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5363 /* Create interleaving stmt (high part of):
5364 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5365 ...}> */
5366 first_vect = data_ref;
5367 second_vect = dr_chain[2];
5368 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5369 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5370 second_vect, perm3_mask_high);
5371 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5372 (*result_chain)[k] = data_ref;
5375 else
5377 /* If length is not equal to 3 then only power of 2 is supported. */
5378 gcc_assert (exact_log2 (length) != -1);
5380 for (i = 0; i < nelt; ++i)
5381 sel[i] = i * 2;
5382 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5384 for (i = 0; i < nelt; ++i)
5385 sel[i] = i * 2 + 1;
5386 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5388 for (i = 0; i < log_length; i++)
5390 for (j = 0; j < length; j += 2)
5392 first_vect = dr_chain[j];
5393 second_vect = dr_chain[j+1];
5395 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5396 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5397 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5398 first_vect, second_vect,
5399 perm_mask_even);
5400 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5401 (*result_chain)[j/2] = data_ref;
5403 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5404 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5405 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5406 first_vect, second_vect,
5407 perm_mask_odd);
5408 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5409 (*result_chain)[j/2+length/2] = data_ref;
5411 memcpy (dr_chain.address (), result_chain->address (),
5412 length * sizeof (tree));
5417 /* Function vect_shift_permute_load_chain.
5419 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
5420 sequence of stmts to reorder the input data accordingly.
5421 Return the final references for loads in RESULT_CHAIN.
5422 Return true if successful, false otherwise.
5424 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5425 The input is 3 vectors each containing 8 elements. We assign a
5426 number to each element, the input sequence is:
5428 1st vec: 0 1 2 3 4 5 6 7
5429 2nd vec: 8 9 10 11 12 13 14 15
5430 3rd vec: 16 17 18 19 20 21 22 23
5432 The output sequence should be:
5434 1st vec: 0 3 6 9 12 15 18 21
5435 2nd vec: 1 4 7 10 13 16 19 22
5436 3rd vec: 2 5 8 11 14 17 20 23
5438 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5440 First we shuffle all 3 vectors to get the correct element order:
5442 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5443 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5444 3rd vec: (16 19 22) (17 20 23) (18 21)
5446 Next we unite and shift the vectors 3 times:
5448 1st step:
5449 shift right by 6 the concatenation of:
5450 "1st vec" and "2nd vec"
5451 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5452 "2nd vec" and "3rd vec"
5453 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5454 "3rd vec" and "1st vec"
5455 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5456 | New vectors |
5458 So that now new vectors are:
5460 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5461 2nd vec: (10 13) (16 19 22) (17 20 23)
5462 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5464 2nd step:
5465 shift right by 5 the concatenation of:
5466 "1st vec" and "3rd vec"
5467 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5468 "2nd vec" and "1st vec"
5469 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5470 "3rd vec" and "2nd vec"
5471 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5472 | New vectors |
5474 So that now new vectors are:
5476 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5477 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5478 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5480 3rd step:
5481 shift right by 5 the concatenation of:
5482 "1st vec" and "1st vec"
5483 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5484 shift right by 3 the concatenation of:
5485 "2nd vec" and "2nd vec"
5486 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5487 | New vectors |
5489 So that now all vectors are READY:
5490 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5491 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5492 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5494 This algorithm is faster than the one in vect_permute_load_chain if:
5495 1. "shift of a concatenation" is faster than a general permutation.
5496 This is usually so.
5497 2. The TARGET machine can't execute vector instructions in parallel.
5498 This is because each step of the algorithm depends on the previous one.
5499 The algorithm in vect_permute_load_chain is much more parallel.
5501 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5504 static bool
5505 vect_shift_permute_load_chain (vec<tree> dr_chain,
5506 unsigned int length,
5507 gimple *stmt,
5508 gimple_stmt_iterator *gsi,
5509 vec<tree> *result_chain)
5511 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5512 tree perm2_mask1, perm2_mask2, perm3_mask;
5513 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5514 gimple *perm_stmt;
5516 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5517 unsigned int i;
5518 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5519 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5520 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5521 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5523 result_chain->quick_grow (length);
5524 memcpy (result_chain->address (), dr_chain.address (),
5525 length * sizeof (tree));
5527 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5529 unsigned int j, log_length = exact_log2 (length);
5530 for (i = 0; i < nelt / 2; ++i)
5531 sel[i] = i * 2;
5532 for (i = 0; i < nelt / 2; ++i)
5533 sel[nelt / 2 + i] = i * 2 + 1;
5534 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5536 if (dump_enabled_p ())
5537 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5538 "shuffle of 2 fields structure is not \
5539 supported by target\n");
5540 return false;
5542 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5544 for (i = 0; i < nelt / 2; ++i)
5545 sel[i] = i * 2 + 1;
5546 for (i = 0; i < nelt / 2; ++i)
5547 sel[nelt / 2 + i] = i * 2;
5548 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5550 if (dump_enabled_p ())
5551 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5552 "shuffle of 2 fields structure is not \
5553 supported by target\n");
5554 return false;
5556 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5558 /* Generating permutation constant to shift all elements.
5559 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5560 for (i = 0; i < nelt; i++)
5561 sel[i] = nelt / 2 + i;
5562 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5564 if (dump_enabled_p ())
5565 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5566 "shift permutation is not supported by target\n");
5567 return false;
5569 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5571 /* Generating permutation constant to select a vector from two.
5572 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5573 for (i = 0; i < nelt / 2; i++)
5574 sel[i] = i;
5575 for (i = nelt / 2; i < nelt; i++)
5576 sel[i] = nelt + i;
5577 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5579 if (dump_enabled_p ())
5580 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5581 "select is not supported by target\n");
5582 return false;
5584 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5586 for (i = 0; i < log_length; i++)
5588 for (j = 0; j < length; j += 2)
5590 first_vect = dr_chain[j];
5591 second_vect = dr_chain[j + 1];
5593 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5594 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5595 first_vect, first_vect,
5596 perm2_mask1);
5597 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5598 vect[0] = data_ref;
5600 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5601 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5602 second_vect, second_vect,
5603 perm2_mask2);
5604 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5605 vect[1] = data_ref;
5607 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5608 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5609 vect[0], vect[1], shift1_mask);
5610 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5611 (*result_chain)[j/2 + length/2] = data_ref;
5613 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5614 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5615 vect[0], vect[1], select_mask);
5616 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5617 (*result_chain)[j/2] = data_ref;
5619 memcpy (dr_chain.address (), result_chain->address (),
5620 length * sizeof (tree));
5622 return true;
5624 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5626 unsigned int k = 0, l = 0;
5628 /* Generating permutation constant to get all elements in right order.
5629 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5630 for (i = 0; i < nelt; i++)
5632 if (3 * k + (l % 3) >= nelt)
5634 k = 0;
5635 l += (3 - (nelt % 3));
5637 sel[i] = 3 * k + (l % 3);
5638 k++;
5640 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5642 if (dump_enabled_p ())
5643 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5644 "shuffle of 3 fields structure is not \
5645 supported by target\n");
5646 return false;
5648 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5650 /* Generating permutation constant to shift all elements.
5651 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5652 for (i = 0; i < nelt; i++)
5653 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5654 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5656 if (dump_enabled_p ())
5657 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5658 "shift permutation is not supported by target\n");
5659 return false;
5661 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5663 /* Generating permutation constant to shift all elements.
5664 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5665 for (i = 0; i < nelt; i++)
5666 sel[i] = 2 * (nelt / 3) + 1 + i;
5667 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5669 if (dump_enabled_p ())
5670 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5671 "shift permutation is not supported by target\n");
5672 return false;
5674 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5676 /* Generating permutation constant to shift all elements.
5677 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5678 for (i = 0; i < nelt; i++)
5679 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5680 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5682 if (dump_enabled_p ())
5683 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5684 "shift permutation is not supported by target\n");
5685 return false;
5687 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5689 /* Generating permutation constant to shift all elements.
5690 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5691 for (i = 0; i < nelt; i++)
5692 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5693 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5695 if (dump_enabled_p ())
5696 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5697 "shift permutation is not supported by target\n");
5698 return false;
5700 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5702 for (k = 0; k < 3; k++)
5704 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5705 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5706 dr_chain[k], dr_chain[k],
5707 perm3_mask);
5708 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5709 vect[k] = data_ref;
5712 for (k = 0; k < 3; k++)
5714 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5715 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5716 vect[k % 3], vect[(k + 1) % 3],
5717 shift1_mask);
5718 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5719 vect_shift[k] = data_ref;
5722 for (k = 0; k < 3; k++)
5724 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5725 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5726 vect_shift[(4 - k) % 3],
5727 vect_shift[(3 - k) % 3],
5728 shift2_mask);
5729 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5730 vect[k] = data_ref;
5733 (*result_chain)[3 - (nelt % 3)] = vect[2];
5735 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5736 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5737 vect[0], shift3_mask);
5738 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5739 (*result_chain)[nelt % 3] = data_ref;
5741 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5742 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5743 vect[1], shift4_mask);
5744 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5745 (*result_chain)[0] = data_ref;
5746 return true;
5748 return false;
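
To see the length == 3 path end to end, here is a minimal host-side sketch (not GCC code) for NELT == 8: vec_perm is an illustrative stand-in for VEC_PERM_EXPR (indices 0..7 select from the first operand, 8..15 from the second), and the five masks are exactly the ones the loops above compute for NELT == 8. Running it prints the three de-interleaved streams 0 3 6 ... 21, 1 4 7 ... 22 and 2 5 8 ... 23. The power-of-two branch earlier in the function works analogously: each input vector is shuffled with a two-field mask, then one shift and one select produce the odd and even streams for each pair.

/* Host-side sketch only: replay the length == 3 shift-permute scheme
   of vect_shift_permute_load_chain for NELT == 8.  */
#include <stdio.h>

#define NELT 8

/* Model VEC_PERM_EXPR: SEL indices 0..NELT-1 pick from A,
   NELT..2*NELT-1 pick from B.  */
static void
vec_perm (const int *a, const int *b, const int *sel, int *out)
{
  for (int i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? a[sel[i]] : b[sel[i] - NELT];
}

int
main (void)
{
  int dr[3][NELT], vect[3][NELT], shifted[3][NELT], result[3][NELT];

  /* The masks computed above for NELT == 8.  */
  static const int perm3[NELT]  = { 0, 3, 6, 1, 4, 7, 2, 5 };
  static const int shift1[NELT] = { 6, 7, 8, 9, 10, 11, 12, 13 };
  static const int shift2[NELT] = { 5, 6, 7, 8, 9, 10, 11, 12 };
  static const int shift3[NELT] = { 3, 4, 5, 6, 7, 8, 9, 10 };
  static const int shift4[NELT] = { 5, 6, 7, 8, 9, 10, 11, 12 };

  /* Interleaved input: three loaded vectors holding 0..23.  */
  for (int k = 0; k < 3; k++)
    for (int i = 0; i < NELT; i++)
      dr[k][i] = k * NELT + i;

  for (int k = 0; k < 3; k++)           /* vect_shuffle3 */
    vec_perm (dr[k], dr[k], perm3, vect[k]);
  for (int k = 0; k < 3; k++)           /* vect_shift1 */
    vec_perm (vect[k % 3], vect[(k + 1) % 3], shift1, shifted[k]);
  for (int k = 0; k < 3; k++)           /* vect_shift2 */
    vec_perm (shifted[(4 - k) % 3], shifted[(3 - k) % 3], shift2, vect[k]);

  /* Final placement mirrors the code above: one vector is ready, the
     other two need one more rotate each.  */
  for (int i = 0; i < NELT; i++)
    result[3 - (NELT % 3)][i] = vect[2][i];
  vec_perm (vect[0], vect[0], shift3, result[NELT % 3]);
  vec_perm (vect[1], vect[1], shift4, result[0]);

  for (int k = 0; k < 3; k++)
    {
      for (int i = 0; i < NELT; i++)
        printf (" %2d", result[k][i]);
      printf ("\n");
    }
  return 0;
}
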
5751 /* Function vect_transform_grouped_load.
5753 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5754 to perform their permutation and assign the resulting vectorized statements to
5755 the scalar statements.
5758 void
5759 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5760 gimple_stmt_iterator *gsi)
5762 machine_mode mode;
5763 vec<tree> result_chain = vNULL;
5765 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5766 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5767 vectors, that are ready for vector computation. */
5768 result_chain.create (size);
5770 /* If the reassociation width for the vector type is 2 or greater, the target
5771 machine can execute 2 or more vector instructions in parallel. Otherwise try
5772 to get the chain for the load group using vect_shift_permute_load_chain. */
5773 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5774 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5775 || exact_log2 (size) != -1
5776 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5777 gsi, &result_chain))
5778 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5779 vect_record_grouped_load_vectors (stmt, result_chain);
5780 result_chain.release ();
5783 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5784 generated as part of the vectorization of STMT. Assign the statement
5785 for each vector to the associated scalar statement. */
5787 void
5788 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5790 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5791 gimple *next_stmt, *new_stmt;
5792 unsigned int i, gap_count;
5793 tree tmp_data_ref;
5795 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5796 Since we scan the chain starting from its first node, their order
5797 corresponds to the order of data-refs in RESULT_CHAIN. */
5798 next_stmt = first_stmt;
5799 gap_count = 1;
5800 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5802 if (!next_stmt)
5803 break;
5805 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5806 code elimination pass later. No need to check for the first stmt in
5807 the group, since it always exists.
5808 GROUP_GAP is the number of steps in elements from the previous
5809 access (if there is no gap GROUP_GAP is 1). We skip loads that
5810 correspond to the gaps. */
5811 if (next_stmt != first_stmt
5812 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5814 gap_count++;
5815 continue;
5818 while (next_stmt)
5820 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5821 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5822 copies, and we put the new vector statement in the first available
5823 RELATED_STMT. */
5824 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5825 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5826 else
5828 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5830 gimple *prev_stmt =
5831 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5832 gimple *rel_stmt =
5833 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5834 while (rel_stmt)
5836 prev_stmt = rel_stmt;
5837 rel_stmt =
5838 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5841 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5842 new_stmt;
5846 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5847 gap_count = 1;
5848 /* If NEXT_STMT accesses the same DR as the previous statement,
5849 put the same TMP_DATA_REF as its vectorized statement; otherwise
5850 get the next data-ref from RESULT_CHAIN. */
5851 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5852 break;
5857 /* Function vect_can_force_dr_alignment_p.
5859 Return whether the alignment of DECL can be forced to an ALIGNMENT-bit
5860 boundary. */
5862 bool
5863 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5865 if (TREE_CODE (decl) != VAR_DECL)
5866 return false;
5868 if (decl_in_symtab_p (decl)
5869 && !symtab_node::get (decl)->can_increase_alignment_p ())
5870 return false;
5872 if (TREE_STATIC (decl))
5873 return (alignment <= MAX_OFILE_ALIGNMENT);
5874 else
5875 return (alignment <= MAX_STACK_ALIGNMENT);
5879 /* Return whether the data reference DR is supported with respect to its
5880 alignment.
5881 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5882 if it is aligned, i.e., check if it is possible to vectorize it with different
5883 alignment. */
5885 enum dr_alignment_support
5886 vect_supportable_dr_alignment (struct data_reference *dr,
5887 bool check_aligned_accesses)
5889 gimple *stmt = DR_STMT (dr);
5890 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5891 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5892 machine_mode mode = TYPE_MODE (vectype);
5893 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5894 struct loop *vect_loop = NULL;
5895 bool nested_in_vect_loop = false;
5897 if (aligned_access_p (dr) && !check_aligned_accesses)
5898 return dr_aligned;
5900 /* For now assume all conditional loads/stores support unaligned
5901 access without any special code. */
5902 if (is_gimple_call (stmt)
5903 && gimple_call_internal_p (stmt)
5904 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5905 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5906 return dr_unaligned_supported;
5908 if (loop_vinfo)
5910 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5911 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5914 /* Possibly unaligned access. */
5916 /* We can choose between using the implicit realignment scheme (generating
5917 a misaligned_move stmt) and the explicit realignment scheme (generating
5918 aligned loads with a REALIGN_LOAD). There are two variants to the
5919 explicit realignment scheme: optimized, and unoptimized.
5920 We can optimize the realignment only if the step between consecutive
5921 vector loads is equal to the vector size. Since the vector memory
5922 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5923 is guaranteed that the misalignment amount remains the same throughout the
5924 execution of the vectorized loop. Therefore, we can create the
5925 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5926 at the loop preheader.
5928 However, in the case of outer-loop vectorization, when vectorizing a
5929 memory access in the inner-loop nested within the LOOP that is now being
5930 vectorized, while it is guaranteed that the misalignment of the
5931 vectorized memory access will remain the same in different outer-loop
5932 iterations, it is *not* guaranteed that is will remain the same throughout
5933 the execution of the inner-loop. This is because the inner-loop advances
5934 with the original scalar step (and not in steps of VS). If the inner-loop
5935 step happens to be a multiple of VS, then the misalignment remains fixed
5936 and we can use the optimized realignment scheme. For example:
5938 for (i=0; i<N; i++)
5939 for (j=0; j<M; j++)
5940 s += a[i+j];
5942 When vectorizing the i-loop in the above example, the step between
5943 consecutive vector loads is 1, and so the misalignment does not remain
5944 fixed across the execution of the inner-loop, and the realignment cannot
5945 be optimized (as illustrated in the following pseudo vectorized loop):
5947 for (i=0; i<N; i+=4)
5948 for (j=0; j<M; j++){
5949 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5950 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5951 // (assuming that we start from an aligned address).
5954 We therefore have to use the unoptimized realignment scheme:
5956 for (i=0; i<N; i+=4)
5957 for (j=k; j<M; j+=4)
5958 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5959 // that the misalignment of the initial address is
5960 // 0).
5962 The loop can then be vectorized as follows:
5964 for (k=0; k<4; k++){
5965 rt = get_realignment_token (&vp[k]);
5966 for (i=0; i<N; i+=4){
5967 v1 = vp[i+k];
5968 for (j=k; j<M; j+=4){
5969 v2 = vp[i+j+VS-1];
5970 va = REALIGN_LOAD <v1,v2,rt>;
5971 vs += va;
5972 v1 = v2;
5975 } */
5977 if (DR_IS_READ (dr))
5979 bool is_packed = false;
5980 tree type = (TREE_TYPE (DR_REF (dr)));
5982 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5983 && (!targetm.vectorize.builtin_mask_for_load
5984 || targetm.vectorize.builtin_mask_for_load ()))
5986 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5988 /* If we are doing SLP then the accesses need not have the
5989 same alignment; instead it depends on the SLP group size. */
5990 if (loop_vinfo
5991 && STMT_SLP_TYPE (stmt_info)
5992 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5993 * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
5994 % TYPE_VECTOR_SUBPARTS (vectype) != 0))
5996 else if (!loop_vinfo
5997 || (nested_in_vect_loop
5998 && (TREE_INT_CST_LOW (DR_STEP (dr))
5999 != GET_MODE_SIZE (TYPE_MODE (vectype)))))
6000 return dr_explicit_realign;
6001 else
6002 return dr_explicit_realign_optimized;
6004 if (!known_alignment_for_access_p (dr))
6005 is_packed = not_size_aligned (DR_REF (dr));
6007 if ((TYPE_USER_ALIGN (type) && !is_packed)
6008 || targetm.vectorize.
6009 support_vector_misalignment (mode, type,
6010 DR_MISALIGNMENT (dr), is_packed))
6011 /* Can't software pipeline the loads, but can at least do them. */
6012 return dr_unaligned_supported;
6014 else
6016 bool is_packed = false;
6017 tree type = (TREE_TYPE (DR_REF (dr)));
6019 if (!known_alignment_for_access_p (dr))
6020 is_packed = not_size_aligned (DR_REF (dr));
6022 if ((TYPE_USER_ALIGN (type) && !is_packed)
6023 || targetm.vectorize.
6024 support_vector_misalignment (mode, type,
6025 DR_MISALIGNMENT (dr), is_packed))
6026 return dr_unaligned_supported;
6029 /* Unsupported. */
6030 return dr_unaligned_unsupported;
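
The explicit realignment scheme described in the comment inside vect_supportable_dr_alignment can be pictured with a small host-side sketch (not GCC code, and not the real optab expansion): realign_load emulates va = REALIGN_LOAD <v1, v2, rt> by taking VS consecutive elements from the concatenation of two aligned loads, starting at the misalignment offset rt. In GCC the realignment token is a target-specific permutation mask produced by targetm.vectorize.builtin_mask_for_load at the loop preheader; treating it as a plain element offset here is a simplifying assumption.

/* Host-side sketch only: emulate va = REALIGN_LOAD <v1, v2, rt> for a
   vector of VS elements.  */
#include <stdio.h>

#define VS 4

static void
realign_load (const int *v1, const int *v2, int rt, int *out)
{
  /* Take VS elements starting at offset RT of the concatenation
     V1 | V2.  RT stands in for the "realignment token".  */
  for (int i = 0; i < VS; i++)
    out[i] = rt + i < VS ? v1[rt + i] : v2[rt + i - VS];
}

int
main (void)
{
  int a[4 * VS], va[VS];
  for (int i = 0; i < 4 * VS; i++)
    a[i] = i;

  int rt = 1;   /* misalignment, in elements, of the address &a[1] */

  /* Two aligned loads bracket the misaligned access to a[1..4].  */
  realign_load (&a[0], &a[VS], rt, va);

  for (int i = 0; i < VS; i++)
    printf (" %d", va[i]);          /* prints 1 2 3 4 */
  printf ("\n");
  return 0;
}
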