gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2016 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "tm_p.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "cgraph.h"
35 #include "dumpfile.h"
36 #include "alias.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "tree-eh.h"
40 #include "gimplify.h"
41 #include "gimple-iterator.h"
42 #include "gimplify-me.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "tree-ssa-loop-manip.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "expr.h"
50 #include "builtins.h"
51 #include "params.h"
53 /* Return true if load- or store-lanes optab OPTAB is implemented for
54 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
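   /* Illustration, assuming a target with 16-byte V4SI vectors: for
      COUNT == 3 the code below asks for an integer mode of 3 * 128 bits
      to hold the whole array of vectors, and then checks whether the
      target implements the corresponding lanes optab (e.g.
      vec_load_lanes) for that <array_mode, V4SI> pair.  */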
56 static bool
57 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
58 tree vectype, unsigned HOST_WIDE_INT count)
60 machine_mode mode, array_mode;
61 bool limit_p;
63 mode = TYPE_MODE (vectype);
64 limit_p = !targetm.array_mode_supported_p (mode, count);
65 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
66 MODE_INT, limit_p);
68 if (array_mode == BLKmode)
70 if (dump_enabled_p ())
71 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
72 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
73 GET_MODE_NAME (mode), count);
74 return false;
77 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
79 if (dump_enabled_p ())
80 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
81 "cannot use %s<%s><%s>\n", name,
82 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
83 return false;
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_NOTE, vect_location,
88 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
89 GET_MODE_NAME (mode));
91 return true;
95 /* Return the smallest scalar type used in STMT.
96 This is used to determine the vectype of the stmt. We generally set the
97 vectype according to the type of the result (lhs). For stmts whose
98 result-type is different than the type of the arguments (e.g., demotion,
99 promotion), vectype will be reset appropriately (later). Note that we have
100 to visit the smallest datatype in this function, because that determines the
101 VF. If the smallest datatype in the loop is present only as the rhs of a
102 promotion operation, we would otherwise miss it.
103 Such a case, where a variable of this datatype does not appear in the lhs
104 anywhere in the loop, can only occur if it's an invariant: e.g.:
105 'int_x = (int) short_inv', which we'd expect to have been optimized away by
106 invariant motion. However, we cannot rely on invariant motion to always
107 take invariants out of the loop, and so in the case of promotion we also
108 have to check the rhs.
109 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
110 types. */
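   /* Illustration: in a loop like

        for (i = 0; i < N; i++)
          int_a[i] = (int) short_b[i];

      'short' never appears as the type of an lhs, yet it is the
      smallest scalar type involved and therefore the one that should
      determine the VF, which is why the rhs of the promotion is
      inspected below.  */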
112 tree
113 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
114 HOST_WIDE_INT *rhs_size_unit)
116 tree scalar_type = gimple_expr_type (stmt);
117 HOST_WIDE_INT lhs, rhs;
119 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
121 if (is_gimple_assign (stmt)
122 && (gimple_assign_cast_p (stmt)
123 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
124 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
125 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
127 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
129 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
130 if (rhs < lhs)
131 scalar_type = rhs_type;
134 *lhs_size_unit = lhs;
135 *rhs_size_unit = rhs;
136 return scalar_type;
140 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
141 tested at run-time. Return TRUE if DDR was successfully inserted.
142 Return false if versioning is not supported. */
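   /* Illustration: for a loop such as

        for (i = 0; i < N; i++)
          p[i] = q[i] + 1;

      where p and q may overlap, marking the DDR here lets the
      vectorizer later emit a run-time check roughly of the form
      "p + N <= q || q + N <= p" and keep an unvectorized copy of the
      loop for the overlapping case.  */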
144 static bool
145 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
147 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
149 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
150 return false;
152 if (dump_enabled_p ())
154 dump_printf_loc (MSG_NOTE, vect_location,
155 "mark for run-time aliasing test between ");
156 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
157 dump_printf (MSG_NOTE, " and ");
158 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
159 dump_printf (MSG_NOTE, "\n");
162 if (optimize_loop_nest_for_size_p (loop))
164 if (dump_enabled_p ())
165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
166 "versioning not supported when optimizing"
167 " for size.\n");
168 return false;
171 /* FORNOW: We don't support versioning with outer-loop vectorization. */
172 if (loop->inner)
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
176 "versioning not yet supported for outer-loops.\n");
177 return false;
180 /* FORNOW: We don't support creating runtime alias tests for non-constant
181 step. */
182 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
183 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
185 if (dump_enabled_p ())
186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
187 "versioning not yet supported for non-constant "
188 "step\n");
189 return false;
192 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
193 return true;
197 /* Function vect_analyze_data_ref_dependence.
199 Return TRUE if there (might) exist a dependence between a memory-reference
200 DRA and a memory-reference DRB. When versioning for alias can handle the
201 dependence with a run-time check, return FALSE. Adjust *MAX_VF according to
202 the data dependence. */
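   /* Illustration: in

        for (i = 0; i < N; i++)
          a[i + 3] = a[i] + 1;

      the dependence distance is 3, so no more than 3 consecutive
      iterations may be executed simultaneously and *MAX_VF is lowered
      accordingly below (if it was larger).  */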
204 static bool
205 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
206 loop_vec_info loop_vinfo, int *max_vf)
208 unsigned int i;
209 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
210 struct data_reference *dra = DDR_A (ddr);
211 struct data_reference *drb = DDR_B (ddr);
212 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
213 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
214 lambda_vector dist_v;
215 unsigned int loop_depth;
217 /* In loop analysis all data references should be vectorizable. */
218 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
219 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
220 gcc_unreachable ();
222 /* Independent data accesses. */
223 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
224 return false;
226 if (dra == drb
227 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
228 return false;
230 /* Even if we have an anti-dependence then, as the vectorized loop covers at
231 least two scalar iterations, there is always also a true dependence.
232 As the vectorizer does not re-order loads and stores we can ignore
233 the anti-dependence if TBAA can disambiguate both DRs similar to the
234 case with known negative distance anti-dependences (positive
235 distance anti-dependences would violate TBAA constraints). */
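     /* E.g. a read through a 'float *' and a write through an 'int *'
	have non-conflicting alias sets, so TBAA guarantees they do not
	access the same object and the anti-dependence can be ignored.  */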
236 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
237 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
238 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
239 get_alias_set (DR_REF (drb))))
240 return false;
242 /* Unknown data dependence. */
243 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
245 /* If user asserted safelen consecutive iterations can be
246 executed concurrently, assume independence. */
247 if (loop->safelen >= 2)
249 if (loop->safelen < *max_vf)
250 *max_vf = loop->safelen;
251 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
252 return false;
255 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
256 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
258 if (dump_enabled_p ())
260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
261 "versioning for alias not supported for: "
262 "can't determine dependence between ");
263 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
264 DR_REF (dra));
265 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
266 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
267 DR_REF (drb));
268 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
270 return true;
273 if (dump_enabled_p ())
275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
276 "versioning for alias required: "
277 "can't determine dependence between ");
278 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
279 DR_REF (dra));
280 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
281 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
282 DR_REF (drb));
283 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
286 /* Add to list of ddrs that need to be tested at run-time. */
287 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
290 /* Known data dependence. */
291 if (DDR_NUM_DIST_VECTS (ddr) == 0)
293 /* If user asserted safelen consecutive iterations can be
294 executed concurrently, assume independence. */
295 if (loop->safelen >= 2)
297 if (loop->safelen < *max_vf)
298 *max_vf = loop->safelen;
299 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
300 return false;
303 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
304 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
306 if (dump_enabled_p ())
308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
309 "versioning for alias not supported for: "
310 "bad dist vector for ");
311 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
312 DR_REF (dra));
313 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
314 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
315 DR_REF (drb));
316 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
318 return true;
321 if (dump_enabled_p ())
323 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
324 "versioning for alias required: "
325 "bad dist vector for ");
326 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
327 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
328 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
329 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
331 /* Add to list of ddrs that need to be tested at run-time. */
332 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
335 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
336 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
338 int dist = dist_v[loop_depth];
340 if (dump_enabled_p ())
341 dump_printf_loc (MSG_NOTE, vect_location,
342 "dependence distance = %d.\n", dist);
344 if (dist == 0)
346 if (dump_enabled_p ())
348 dump_printf_loc (MSG_NOTE, vect_location,
349 "dependence distance == 0 between ");
350 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
351 dump_printf (MSG_NOTE, " and ");
352 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
353 dump_printf (MSG_NOTE, "\n");
356 /* When we perform grouped accesses and perform implicit CSE
357 by detecting equal accesses and doing disambiguation with
358 runtime alias tests like for
359 .. = a[i];
360 .. = a[i+1];
361 a[i] = ..;
362 a[i+1] = ..;
363 *p = ..;
364 .. = a[i];
365 .. = a[i+1];
366 where we will end up loading { a[i], a[i+1] } once, make
367 sure that inserting group loads before the first load and
368 stores after the last store will do the right thing.
369 Similar for groups like
370 a[i] = ...;
371 ... = a[i];
372 a[i+1] = ...;
373 where loads from the group interleave with the store. */
374 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
375 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
377 gimple *earlier_stmt;
378 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
379 if (DR_IS_WRITE
380 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
382 if (dump_enabled_p ())
383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
384 "READ_WRITE dependence in interleaving."
385 "\n");
386 return true;
390 continue;
393 if (dist > 0 && DDR_REVERSED_P (ddr))
395 /* If DDR_REVERSED_P the order of the data-refs in DDR was
396 reversed (to make distance vector positive), and the actual
397 distance is negative. */
398 if (dump_enabled_p ())
399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
400 "dependence distance negative.\n");
401 /* Record a negative dependence distance to later limit the
402 amount of stmt copying / unrolling we can perform.
403 Only need to handle read-after-write dependence. */
404 if (DR_IS_READ (drb)
405 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
406 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
407 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
408 continue;
411 if (abs (dist) >= 2
412 && abs (dist) < *max_vf)
414 /* The dependence distance requires reduction of the maximal
415 vectorization factor. */
416 *max_vf = abs (dist);
417 if (dump_enabled_p ())
418 dump_printf_loc (MSG_NOTE, vect_location,
419 "adjusting maximal vectorization factor to %i\n",
420 *max_vf);
423 if (abs (dist) >= *max_vf)
425 /* Dependence distance does not create dependence, as far as
426 vectorization is concerned, in this case. */
427 if (dump_enabled_p ())
428 dump_printf_loc (MSG_NOTE, vect_location,
429 "dependence distance >= VF.\n");
430 continue;
433 if (dump_enabled_p ())
435 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
436 "not vectorized, possible dependence "
437 "between data-refs ");
438 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
439 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
440 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
441 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
444 return true;
447 return false;
450 /* Function vect_analyze_data_ref_dependences.
452 Examine all the data references in the loop, and make sure there do not
453 exist any data dependences between them. Set *MAX_VF according to
454 the maximum vectorization factor the data dependences allow. */
456 bool
457 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
459 unsigned int i;
460 struct data_dependence_relation *ddr;
462 if (dump_enabled_p ())
463 dump_printf_loc (MSG_NOTE, vect_location,
464 "=== vect_analyze_data_ref_dependences ===\n");
466 LOOP_VINFO_DDRS (loop_vinfo)
467 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
468 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
469 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
470 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
471 &LOOP_VINFO_DDRS (loop_vinfo),
472 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
473 return false;
475 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
476 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
477 return false;
479 return true;
483 /* Function vect_slp_analyze_data_ref_dependence.
485 Return TRUE if there (might) exist a dependence between a memory-reference
486 DRA and a memory-reference DRB for SLP (basic-block) vectorization. Unlike
487 the loop variant there is no run-time alias versioning fallback here and
488 no *MAX_VF to adjust. */
490 static bool
491 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
493 struct data_reference *dra = DDR_A (ddr);
494 struct data_reference *drb = DDR_B (ddr);
496 /* We need to check dependences of statements marked as unvectorizable
497 as well; they can still prohibit vectorization. */
499 /* Independent data accesses. */
500 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
501 return false;
503 if (dra == drb)
504 return false;
506 /* Read-read is OK. */
507 if (DR_IS_READ (dra) && DR_IS_READ (drb))
508 return false;
510 /* If dra and drb are part of the same interleaving chain consider
511 them independent. */
512 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
513 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
514 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
515 return false;
517 /* Unknown data dependence. */
518 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
520 if (dump_enabled_p ())
522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
523 "can't determine dependence between ");
524 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
525 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
526 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
527 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
530 else if (dump_enabled_p ())
532 dump_printf_loc (MSG_NOTE, vect_location,
533 "determined dependence between ");
534 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
535 dump_printf (MSG_NOTE, " and ");
536 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
537 dump_printf (MSG_NOTE, "\n");
540 return true;
544 /* Analyze dependences involved in the transform of SLP NODE. STORES
545 contains the vector of scalar stores of this instance if we are
546 disambiguating the loads. */
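   /* Illustration: for an SLP load group { ... = a[i]; ... = a[i+1]; }
      with an intervening store "*p = x;" between the two scalar loads,
      the vectorized load will be emitted at the position of the last
      scalar statement of the group, so the walk below has to verify
      that *p cannot alias a[i].  */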
548 static bool
549 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
550 vec<gimple *> stores, gimple *last_store)
552 /* This walks over all stmts involved in the SLP load/store done
553 in NODE verifying we can sink them up to the last stmt in the
554 group. */
555 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
556 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
558 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
559 if (access == last_access)
560 continue;
561 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
562 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
563 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
565 gimple *stmt = gsi_stmt (gsi);
566 if (! gimple_vuse (stmt)
567 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
568 continue;
570 /* If we couldn't record a (single) data reference for this
571 stmt we have to give up. */
572 /* ??? Here and below if dependence analysis fails we can resort
573 to the alias oracle which can handle more kinds of stmts. */
574 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
575 if (!dr_b)
576 return false;
578 /* If we run into a store of this same instance (we've just
579 marked those) then delay dependence checking until we run
580 into the last store because this is where it will have
581 been sunk to (and we verify if we can do that as well). */
582 if (gimple_visited_p (stmt))
584 if (stmt != last_store)
585 continue;
586 unsigned i;
587 gimple *store;
588 FOR_EACH_VEC_ELT (stores, i, store)
590 data_reference *store_dr
591 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
592 ddr_p ddr = initialize_data_dependence_relation
593 (dr_a, store_dr, vNULL);
594 if (vect_slp_analyze_data_ref_dependence (ddr))
596 free_dependence_relation (ddr);
597 return false;
599 free_dependence_relation (ddr);
603 ddr_p ddr = initialize_data_dependence_relation (dr_a, dr_b, vNULL);
604 if (vect_slp_analyze_data_ref_dependence (ddr))
606 free_dependence_relation (ddr);
607 return false;
609 free_dependence_relation (ddr);
612 return true;
616 /* Function vect_slp_analyze_instance_dependence.
618 Examine all the data references in the SLP instance INSTANCE and make
619 sure there do not exist any data dependences between them that would
620 prevent the vectorization of the instance. */
622 bool
623 vect_slp_analyze_instance_dependence (slp_instance instance)
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE, vect_location,
627 "=== vect_slp_analyze_instance_dependence ===\n");
629 /* The stores of this instance are at the root of the SLP tree. */
630 slp_tree store = SLP_INSTANCE_TREE (instance);
631 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
632 store = NULL;
634 /* Verify we can sink stores to the vectorized stmt insert location. */
635 gimple *last_store = NULL;
636 if (store)
638 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
639 return false;
641 /* Mark stores in this instance and remember the last one. */
642 last_store = vect_find_last_scalar_stmt_in_slp (store);
643 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
644 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
647 bool res = true;
649 /* Verify we can sink loads to the vectorized stmt insert location,
650 special-casing stores of this instance. */
651 slp_tree load;
652 unsigned int i;
653 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
654 if (! vect_slp_analyze_node_dependences (instance, load,
655 store
656 ? SLP_TREE_SCALAR_STMTS (store)
657 : vNULL, last_store))
659 res = false;
660 break;
663 /* Unset the visited flag. */
664 if (store)
665 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
666 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
668 return res;
671 /* Function vect_compute_data_ref_alignment
673 Compute the misalignment of the data reference DR.
675 Output:
676 1. If during the misalignment computation it is found that the data reference
677 cannot be vectorized then false is returned.
678 2. DR_MISALIGNMENT (DR) is defined.
680 FOR NOW: No analysis is actually performed. Misalignment is calculated
681 only for trivial cases. TODO. */
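   /* Illustration, assuming 16-byte V4SI vectors: for an access a[i + 1]
      with 'int a[N]' whose start is 16-byte aligned, DR_BASE_ADDRESS is
      &a, DR_INIT is 4, and the misalignment computed below is 4 bytes;
      if nothing useful is known about the base alignment the result is
      -1 (unknown).  */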
683 bool
684 vect_compute_data_ref_alignment (struct data_reference *dr)
686 gimple *stmt = DR_STMT (dr);
687 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
688 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
689 struct loop *loop = NULL;
690 tree ref = DR_REF (dr);
691 tree vectype;
692 tree base, base_addr;
693 tree misalign = NULL_TREE;
694 tree aligned_to;
695 tree step;
696 unsigned HOST_WIDE_INT alignment;
698 if (dump_enabled_p ())
699 dump_printf_loc (MSG_NOTE, vect_location,
700 "vect_compute_data_ref_alignment:\n");
702 if (loop_vinfo)
703 loop = LOOP_VINFO_LOOP (loop_vinfo);
705 /* Initialize misalignment to unknown. */
706 SET_DR_MISALIGNMENT (dr, -1);
708 if (tree_fits_shwi_p (DR_STEP (dr)))
709 misalign = DR_INIT (dr);
710 aligned_to = DR_ALIGNED_TO (dr);
711 base_addr = DR_BASE_ADDRESS (dr);
712 vectype = STMT_VINFO_VECTYPE (stmt_info);
714 /* In case the dataref is in an inner-loop of the loop that is being
715 vectorized (LOOP), we use the base and misalignment information
716 relative to the outer-loop (LOOP). This is ok only if the misalignment
717 stays the same throughout the execution of the inner-loop, which is why
718 we have to check that the stride of the dataref in the inner-loop is
719 evenly divisible by the vector size. */
720 if (loop && nested_in_vect_loop_p (loop, stmt))
722 tree step = DR_STEP (dr);
724 if (tree_fits_shwi_p (step)
725 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
727 if (dump_enabled_p ())
728 dump_printf_loc (MSG_NOTE, vect_location,
729 "inner step divides the vector-size.\n");
730 misalign = STMT_VINFO_DR_INIT (stmt_info);
731 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
732 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
734 else
736 if (dump_enabled_p ())
737 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
738 "inner step doesn't divide the vector-size.\n");
739 misalign = NULL_TREE;
743 /* Similarly we can only use base and misalignment information relative to
744 an innermost loop if the misalignment stays the same throughout the
745 execution of the loop. As above, this is the case if the stride of
746 the dataref times the vectorization factor is divisible by the vector size. */
747 else
749 tree step = DR_STEP (dr);
750 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
752 if (tree_fits_shwi_p (step)
753 && ((tree_to_shwi (step) * vf)
754 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
756 if (dump_enabled_p ())
757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
758 "step doesn't divide the vector-size.\n");
759 misalign = NULL_TREE;
763 /* To look at alignment of the base we have to preserve an inner MEM_REF
764 as that carries alignment information of the actual access. */
765 base = ref;
766 while (handled_component_p (base))
767 base = TREE_OPERAND (base, 0);
768 unsigned int base_alignment;
769 unsigned HOST_WIDE_INT base_bitpos;
770 get_object_alignment_1 (base, &base_alignment, &base_bitpos);
771 /* As data-ref analysis strips the MEM_REF down to its base operand
772 to form DR_BASE_ADDRESS and adds the offset to DR_INIT we have to
773 adjust things to make base_alignment valid as the alignment of
774 DR_BASE_ADDRESS. */
775 if (TREE_CODE (base) == MEM_REF)
777 base_bitpos -= mem_ref_offset (base).to_short_addr () * BITS_PER_UNIT;
778 base_bitpos &= (base_alignment - 1);
780 if (base_bitpos != 0)
781 base_alignment = base_bitpos & -base_bitpos;
782 /* Also look at the alignment of the base address DR analysis
783 computed. */
784 unsigned int base_addr_alignment = get_pointer_alignment (base_addr);
785 if (base_addr_alignment > base_alignment)
786 base_alignment = base_addr_alignment;
788 if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
789 DR_VECT_AUX (dr)->base_element_aligned = true;
791 alignment = TYPE_ALIGN_UNIT (vectype);
793 if ((compare_tree_int (aligned_to, alignment) < 0)
794 || !misalign)
796 if (dump_enabled_p ())
798 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
799 "Unknown alignment for access: ");
800 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
801 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
803 return true;
806 if (base_alignment < TYPE_ALIGN (vectype))
808 base = base_addr;
809 if (TREE_CODE (base) == ADDR_EXPR)
810 base = TREE_OPERAND (base, 0);
811 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
813 if (dump_enabled_p ())
815 dump_printf_loc (MSG_NOTE, vect_location,
816 "can't force alignment of ref: ");
817 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
818 dump_printf (MSG_NOTE, "\n");
820 return true;
823 /* Force the alignment of the decl.
824 NOTE: This is the only change to the code we make during
825 the analysis phase, before deciding to vectorize the loop. */
826 if (dump_enabled_p ())
828 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
829 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
830 dump_printf (MSG_NOTE, "\n");
833 DR_VECT_AUX (dr)->base_decl = base;
834 DR_VECT_AUX (dr)->base_misaligned = true;
835 DR_VECT_AUX (dr)->base_element_aligned = true;
838 if (loop && nested_in_vect_loop_p (loop, stmt))
839 step = STMT_VINFO_DR_STEP (stmt_info);
840 else
841 step = DR_STEP (dr);
842 /* If this is a backward running DR then the first access in the larger
843 vectype actually is N-1 elements before the address in the DR.
844 Adjust misalign accordingly. */
845 if (tree_int_cst_sgn (step) < 0)
847 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
848 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
849 otherwise we wouldn't be here. */
850 offset = fold_build2 (MULT_EXPR, ssizetype, offset, step);
851 /* PLUS because STEP was negative. */
852 misalign = size_binop (PLUS_EXPR, misalign, offset);
855 SET_DR_MISALIGNMENT (dr,
856 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
858 if (dump_enabled_p ())
860 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
861 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
862 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
863 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
866 return true;
870 /* Function vect_update_misalignment_for_peel
872 DR - the data reference whose misalignment is to be adjusted.
873 DR_PEEL - the data reference whose misalignment is being made
874 zero in the vector loop by the peel.
875 NPEEL - the number of iterations in the peel loop if the misalignment
876 of DR_PEEL is known at compile time. */
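   /* Illustration, assuming 16-byte vectors of 4-byte elements: if
      peeling NPEEL == 3 iterations aligns DR_PEEL, then a DR that was
      misaligned by 8 bytes ends up misaligned by (8 + 3 * 4) & 15 == 4
      bytes after the peel (for a forward, positive-step access).  */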
878 static void
879 vect_update_misalignment_for_peel (struct data_reference *dr,
880 struct data_reference *dr_peel, int npeel)
882 unsigned int i;
883 vec<dr_p> same_align_drs;
884 struct data_reference *current_dr;
885 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
886 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
887 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
888 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
890 /* For interleaved data accesses the step in the loop must be multiplied by
891 the size of the interleaving group. */
892 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
893 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
894 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
895 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
897 /* It can be assumed that the data refs with the same alignment as dr_peel
898 are aligned in the vector loop. */
899 same_align_drs
900 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
901 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
903 if (current_dr != dr)
904 continue;
905 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
906 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
907 SET_DR_MISALIGNMENT (dr, 0);
908 return;
911 if (known_alignment_for_access_p (dr)
912 && known_alignment_for_access_p (dr_peel))
914 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
915 int misal = DR_MISALIGNMENT (dr);
916 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
917 misal += negative ? -npeel * dr_size : npeel * dr_size;
918 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
919 SET_DR_MISALIGNMENT (dr, misal);
920 return;
923 if (dump_enabled_p ())
924 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
925 SET_DR_MISALIGNMENT (dr, -1);
929 /* Function verify_data_ref_alignment
931 Return TRUE if DR can be handled with respect to alignment. */
933 static bool
934 verify_data_ref_alignment (data_reference_p dr)
936 enum dr_alignment_support supportable_dr_alignment
937 = vect_supportable_dr_alignment (dr, false);
938 if (!supportable_dr_alignment)
940 if (dump_enabled_p ())
942 if (DR_IS_READ (dr))
943 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
944 "not vectorized: unsupported unaligned load.");
945 else
946 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
947 "not vectorized: unsupported unaligned "
948 "store.");
950 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
951 DR_REF (dr));
952 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
954 return false;
957 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
958 dump_printf_loc (MSG_NOTE, vect_location,
959 "Vectorizing an unaligned access.\n");
961 return true;
964 /* Function vect_verify_datarefs_alignment
966 Return TRUE if all data references in the loop can be
967 handled with respect to alignment. */
969 bool
970 vect_verify_datarefs_alignment (loop_vec_info vinfo)
972 vec<data_reference_p> datarefs = vinfo->datarefs;
973 struct data_reference *dr;
974 unsigned int i;
976 FOR_EACH_VEC_ELT (datarefs, i, dr)
978 gimple *stmt = DR_STMT (dr);
979 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
981 if (!STMT_VINFO_RELEVANT_P (stmt_info))
982 continue;
984 /* For interleaving, only the alignment of the first access matters. */
985 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
986 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
987 continue;
989 /* Strided accesses perform only component accesses, alignment is
990 irrelevant for them. */
991 if (STMT_VINFO_STRIDED_P (stmt_info)
992 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
993 continue;
995 if (! verify_data_ref_alignment (dr))
996 return false;
999 return true;
1002 /* Given a memory reference EXP return whether its alignment is less
1003 than its size. */
1005 static bool
1006 not_size_aligned (tree exp)
1008 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1009 return true;
1011 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1012 > get_object_alignment (exp));
1015 /* Function vector_alignment_reachable_p
1017 Return true if vector alignment for DR is reachable by peeling
1018 a few loop iterations. Return false otherwise. */
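   /* Illustration: a 4-byte element access that is misaligned by 1 byte
      can never be brought to vector alignment by peeling whole scalar
      iterations, since each peeled iteration advances the address by a
      multiple of 4 bytes; this is what the "data size does not divide
      the misalignment" check below rejects.  */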
1020 static bool
1021 vector_alignment_reachable_p (struct data_reference *dr)
1023 gimple *stmt = DR_STMT (dr);
1024 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1025 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1027 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1029 /* For interleaved access we peel only if the number of iterations in
1030 the prolog loop ({VF - misalignment}) is a multiple of the
1031 number of interleaved accesses. */
1032 int elem_size, mis_in_elements;
1033 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1035 /* FORNOW: handle only known alignment. */
1036 if (!known_alignment_for_access_p (dr))
1037 return false;
1039 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1040 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1042 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1043 return false;
1046 /* If the misalignment is known at compile time then allow peeling
1047 only if natural alignment is reachable through peeling. */
1048 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1050 HOST_WIDE_INT elmsize =
1051 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1052 if (dump_enabled_p ())
1054 dump_printf_loc (MSG_NOTE, vect_location,
1055 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1056 dump_printf (MSG_NOTE,
1057 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1059 if (DR_MISALIGNMENT (dr) % elmsize)
1061 if (dump_enabled_p ())
1062 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1063 "data size does not divide the misalignment.\n");
1064 return false;
1068 if (!known_alignment_for_access_p (dr))
1070 tree type = TREE_TYPE (DR_REF (dr));
1071 bool is_packed = not_size_aligned (DR_REF (dr));
1072 if (dump_enabled_p ())
1073 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1074 "Unknown misalignment, is_packed = %d\n",is_packed);
1075 if ((TYPE_USER_ALIGN (type) && !is_packed)
1076 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1077 return true;
1078 else
1079 return false;
1082 return true;
1086 /* Calculate the cost of the memory access represented by DR. */
1088 static void
1089 vect_get_data_access_cost (struct data_reference *dr,
1090 unsigned int *inside_cost,
1091 unsigned int *outside_cost,
1092 stmt_vector_for_cost *body_cost_vec)
1094 gimple *stmt = DR_STMT (dr);
1095 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1096 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1097 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1098 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1099 int ncopies = vf / nunits;
1101 if (DR_IS_READ (dr))
1102 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1103 NULL, body_cost_vec, false);
1104 else
1105 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1107 if (dump_enabled_p ())
1108 dump_printf_loc (MSG_NOTE, vect_location,
1109 "vect_get_data_access_cost: inside_cost = %d, "
1110 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1114 typedef struct _vect_peel_info
1116 int npeel;
1117 struct data_reference *dr;
1118 unsigned int count;
1119 } *vect_peel_info;
1121 typedef struct _vect_peel_extended_info
1123 struct _vect_peel_info peel_info;
1124 unsigned int inside_cost;
1125 unsigned int outside_cost;
1126 stmt_vector_for_cost body_cost_vec;
1127 } *vect_peel_extended_info;
1130 /* Peeling hashtable helpers. */
1132 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1134 static inline hashval_t hash (const _vect_peel_info *);
1135 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1138 inline hashval_t
1139 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1141 return (hashval_t) peel_info->npeel;
1144 inline bool
1145 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1147 return (a->npeel == b->npeel);
1151 /* Insert DR into peeling hash table with NPEEL as key. */
1153 static void
1154 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1155 loop_vec_info loop_vinfo, struct data_reference *dr,
1156 int npeel)
1158 struct _vect_peel_info elem, *slot;
1159 _vect_peel_info **new_slot;
1160 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1162 elem.npeel = npeel;
1163 slot = peeling_htab->find (&elem);
1164 if (slot)
1165 slot->count++;
1166 else
1168 slot = XNEW (struct _vect_peel_info);
1169 slot->npeel = npeel;
1170 slot->dr = dr;
1171 slot->count = 1;
1172 new_slot = peeling_htab->find_slot (slot, INSERT);
1173 *new_slot = slot;
1176 if (!supportable_dr_alignment
1177 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1178 slot->count += VECT_MAX_COST;
1182 /* Traverse peeling hash table to find peeling option that aligns maximum
1183 number of data accesses. */
1186 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1187 _vect_peel_extended_info *max)
1189 vect_peel_info elem = *slot;
1191 if (elem->count > max->peel_info.count
1192 || (elem->count == max->peel_info.count
1193 && max->peel_info.npeel > elem->npeel))
1195 max->peel_info.npeel = elem->npeel;
1196 max->peel_info.count = elem->count;
1197 max->peel_info.dr = elem->dr;
1200 return 1;
1204 /* Traverse peeling hash table and calculate cost for each peeling option.
1205 Find the one with the lowest cost. */
1208 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1209 _vect_peel_extended_info *min)
1211 vect_peel_info elem = *slot;
1212 int save_misalignment, dummy;
1213 unsigned int inside_cost = 0, outside_cost = 0, i;
1214 gimple *stmt = DR_STMT (elem->dr);
1215 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1216 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1217 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1218 struct data_reference *dr;
1219 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1221 prologue_cost_vec.create (2);
1222 body_cost_vec.create (2);
1223 epilogue_cost_vec.create (2);
1225 FOR_EACH_VEC_ELT (datarefs, i, dr)
1227 stmt = DR_STMT (dr);
1228 stmt_info = vinfo_for_stmt (stmt);
1229 /* For interleaving, only the alignment of the first access
1230 matters. */
1231 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1232 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1233 continue;
1235 /* Strided accesses perform only component accesses, alignment is
1236 irrelevant for them. */
1237 if (STMT_VINFO_STRIDED_P (stmt_info)
1238 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1239 continue;
1241 save_misalignment = DR_MISALIGNMENT (dr);
1242 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1243 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1244 &body_cost_vec);
1245 SET_DR_MISALIGNMENT (dr, save_misalignment);
1248 outside_cost += vect_get_known_peeling_cost
1249 (loop_vinfo, elem->npeel, &dummy,
1250 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1251 &prologue_cost_vec, &epilogue_cost_vec);
1253 /* Prologue and epilogue costs are added to the target model later.
1254 These costs depend only on the scalar iteration cost, the
1255 number of peeling iterations finally chosen, and the number of
1256 misaligned statements. So discard the information found here. */
1257 prologue_cost_vec.release ();
1258 epilogue_cost_vec.release ();
1260 if (inside_cost < min->inside_cost
1261 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1263 min->inside_cost = inside_cost;
1264 min->outside_cost = outside_cost;
1265 min->body_cost_vec.release ();
1266 min->body_cost_vec = body_cost_vec;
1267 min->peel_info.dr = elem->dr;
1268 min->peel_info.npeel = elem->npeel;
1270 else
1271 body_cost_vec.release ();
1273 return 1;
1277 /* Choose best peeling option by traversing peeling hash table and either
1278 choosing an option with the lowest cost (if cost model is enabled) or the
1279 option that aligns as many accesses as possible. */
1281 static struct data_reference *
1282 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1283 loop_vec_info loop_vinfo,
1284 unsigned int *npeel,
1285 stmt_vector_for_cost *body_cost_vec)
1287 struct _vect_peel_extended_info res;
1289 res.peel_info.dr = NULL;
1290 res.body_cost_vec = stmt_vector_for_cost ();
1292 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1294 res.inside_cost = INT_MAX;
1295 res.outside_cost = INT_MAX;
1296 peeling_htab->traverse <_vect_peel_extended_info *,
1297 vect_peeling_hash_get_lowest_cost> (&res);
1299 else
1301 res.peel_info.count = 0;
1302 peeling_htab->traverse <_vect_peel_extended_info *,
1303 vect_peeling_hash_get_most_frequent> (&res);
1306 *npeel = res.peel_info.npeel;
1307 *body_cost_vec = res.body_cost_vec;
1308 return res.peel_info.dr;
1312 /* Function vect_enhance_data_refs_alignment
1314 This pass will use loop versioning and loop peeling in order to enhance
1315 the alignment of data references in the loop.
1317 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1318 original loop is to be vectorized. Any other loops that are created by
1319 the transformations performed in this pass are not supposed to be
1320 vectorized. This restriction will be relaxed.
1322 This pass will require a cost model to guide it whether to apply peeling
1323 or versioning or a combination of the two. For example, the scheme that
1324 Intel uses when given a loop with several memory accesses is as follows:
1325 choose one memory access ('p') which alignment you want to force by doing
1326 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1327 other accesses are not necessarily aligned, or (2) use loop versioning to
1328 generate one loop in which all accesses are aligned, and another loop in
1329 which only 'p' is necessarily aligned.
1331 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1332 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1333 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1335 Devising a cost model is the most critical aspect of this work. It will
1336 guide us on which access to peel for, whether to use loop versioning, how
1337 many versions to create, etc. The cost model will probably consist of
1338 generic considerations as well as target specific considerations (on
1339 powerpc for example, misaligned stores are more painful than misaligned
1340 loads).
1342 Here are the general steps involved in alignment enhancements:
1344 -- original loop, before alignment analysis:
1345 for (i=0; i<N; i++){
1346 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1347 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1350 -- After vect_compute_data_refs_alignment:
1351 for (i=0; i<N; i++){
1352 x = q[i]; # DR_MISALIGNMENT(q) = 3
1353 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1356 -- Possibility 1: we do loop versioning:
1357 if (p is aligned) {
1358 for (i=0; i<N; i++){ # loop 1A
1359 x = q[i]; # DR_MISALIGNMENT(q) = 3
1360 p[i] = y; # DR_MISALIGNMENT(p) = 0
1363 else {
1364 for (i=0; i<N; i++){ # loop 1B
1365 x = q[i]; # DR_MISALIGNMENT(q) = 3
1366 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1370 -- Possibility 2: we do loop peeling:
1371 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1372 x = q[i];
1373 p[i] = y;
1375 for (i = 3; i < N; i++){ # loop 2A
1376 x = q[i]; # DR_MISALIGNMENT(q) = 0
1377 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1380 -- Possibility 3: combination of loop peeling and versioning:
1381 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1382 x = q[i];
1383 p[i] = y;
1385 if (p is aligned) {
1386 for (i = 3; i<N; i++){ # loop 3A
1387 x = q[i]; # DR_MISALIGNMENT(q) = 0
1388 p[i] = y; # DR_MISALIGNMENT(p) = 0
1391 else {
1392 for (i = 3; i<N; i++){ # loop 3B
1393 x = q[i]; # DR_MISALIGNMENT(q) = 0
1394 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1398 These loops are later passed to loop_transform to be vectorized. The
1399 vectorizer will use the alignment information to guide the transformation
1400 (whether to generate regular loads/stores, or with special handling for
1401 misalignment). */
1403 bool
1404 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1406 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1407 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1408 enum dr_alignment_support supportable_dr_alignment;
1409 struct data_reference *dr0 = NULL, *first_store = NULL;
1410 struct data_reference *dr;
1411 unsigned int i, j;
1412 bool do_peeling = false;
1413 bool do_versioning = false;
1414 bool stat;
1415 gimple *stmt;
1416 stmt_vec_info stmt_info;
1417 unsigned int npeel = 0;
1418 bool all_misalignments_unknown = true;
1419 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1420 unsigned possible_npeel_number = 1;
1421 tree vectype;
1422 unsigned int nelements, mis, same_align_drs_max = 0;
1423 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1424 hash_table<peel_info_hasher> peeling_htab (1);
1426 if (dump_enabled_p ())
1427 dump_printf_loc (MSG_NOTE, vect_location,
1428 "=== vect_enhance_data_refs_alignment ===\n");
1430 /* Reset data so we can safely be called multiple times. */
1431 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1432 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1434 /* While cost model enhancements are expected in the future, the high level
1435 view of the code at this time is as follows:
1437 A) If there is a misaligned access then see if peeling to align
1438 this access can make all data references satisfy
1439 vect_supportable_dr_alignment. If so, update data structures
1440 as needed and return true.
1442 B) If peeling wasn't possible and there is a data reference with an
1443 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1444 then see if loop versioning checks can be used to make all data
1445 references satisfy vect_supportable_dr_alignment. If so, update
1446 data structures as needed and return true.
1448 C) If neither peeling nor versioning were successful then return false if
1449 any data reference does not satisfy vect_supportable_dr_alignment.
1451 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1453 Note, Possibility 3 above (which is peeling and versioning together) is not
1454 being done at this time. */
1456 /* (1) Peeling to force alignment. */
1458 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1459 Considerations:
1460 + How many accesses will become aligned due to the peeling
1461 - How many accesses will become unaligned due to the peeling,
1462 and the cost of misaligned accesses.
1463 - The cost of peeling (the extra runtime checks, the increase
1464 in code size). */
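   /* Illustration: for a forward V4SI access (nelements == 4) whose known
      misalignment is 2 elements, the loop below records npeel_tmp
      == (4 - 2) & 3 == 2 as a peeling candidate; with an unlimited cost
      model and a larger vectorization factor it also tries 2 + 4,
      2 + 8, ... so that other accesses may become aligned as well.  */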
1466 FOR_EACH_VEC_ELT (datarefs, i, dr)
1468 stmt = DR_STMT (dr);
1469 stmt_info = vinfo_for_stmt (stmt);
1471 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1472 continue;
1474 /* For interleaving, only the alignment of the first access
1475 matters. */
1476 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1477 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1478 continue;
1480 /* For invariant accesses there is nothing to enhance. */
1481 if (integer_zerop (DR_STEP (dr)))
1482 continue;
1484 /* Strided accesses perform only component accesses, alignment is
1485 irrelevant for them. */
1486 if (STMT_VINFO_STRIDED_P (stmt_info)
1487 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1488 continue;
1490 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1491 do_peeling = vector_alignment_reachable_p (dr);
1492 if (do_peeling)
1494 if (known_alignment_for_access_p (dr))
1496 unsigned int npeel_tmp;
1497 bool negative = tree_int_cst_compare (DR_STEP (dr),
1498 size_zero_node) < 0;
1500 /* Save info about DR in the hash table. */
1501 vectype = STMT_VINFO_VECTYPE (stmt_info);
1502 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1503 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1504 TREE_TYPE (DR_REF (dr))));
1505 npeel_tmp = (negative
1506 ? (mis - nelements) : (nelements - mis))
1507 & (nelements - 1);
1509 /* For multiple types, it is possible that the bigger type access
1510 will have more than one peeling option. E.g., a loop with two
1511 types: one of size (vector size / 4), and the other one of
1512 size (vector size / 8). The vectorization factor will be 8. If both
1513 accesses are misaligned by 3, the first one needs one scalar
1514 iteration to be aligned, and the second one needs 5. But the
1515 first one will be aligned also by peeling 5 scalar
1516 iterations, and in that case both accesses will be aligned.
1517 Hence, except for the immediate peeling amount, we also want
1518 to try to add the full vector size, as long as we don't exceed the
1519 vectorization factor.
1520 We do this automatically for the cost model, since we calculate the cost
1521 for every peeling option. */
1522 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1524 if (STMT_SLP_TYPE (stmt_info))
1525 possible_npeel_number
1526 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1527 else
1528 possible_npeel_number = vf / nelements;
1531 /* Handle the aligned case. We may decide to align some other
1532 access, making DR unaligned. */
1533 if (DR_MISALIGNMENT (dr) == 0)
1535 npeel_tmp = 0;
1536 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1537 possible_npeel_number++;
1540 for (j = 0; j < possible_npeel_number; j++)
1542 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1543 dr, npeel_tmp);
1544 npeel_tmp += nelements;
1547 all_misalignments_unknown = false;
1548 /* Data-ref that was chosen for the case that all the
1549 misalignments are unknown is not relevant anymore, since we
1550 have a data-ref with known alignment. */
1551 dr0 = NULL;
1553 else
1555 /* If we don't know any misalignment values, we prefer
1556 peeling for the data-ref that has the maximum number of data-refs
1557 with the same alignment, unless the target prefers to align
1558 stores over loads. */
1559 if (all_misalignments_unknown)
1561 unsigned same_align_drs
1562 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1563 if (!dr0
1564 || same_align_drs_max < same_align_drs)
1566 same_align_drs_max = same_align_drs;
1567 dr0 = dr;
1569 /* For data-refs with the same number of related
1570 accesses prefer the one where the misalign
1571 computation will be invariant in the outermost loop. */
1572 else if (same_align_drs_max == same_align_drs)
1574 struct loop *ivloop0, *ivloop;
1575 ivloop0 = outermost_invariant_loop_for_expr
1576 (loop, DR_BASE_ADDRESS (dr0));
1577 ivloop = outermost_invariant_loop_for_expr
1578 (loop, DR_BASE_ADDRESS (dr));
1579 if ((ivloop && !ivloop0)
1580 || (ivloop && ivloop0
1581 && flow_loop_nested_p (ivloop, ivloop0)))
1582 dr0 = dr;
1585 if (!first_store && DR_IS_WRITE (dr))
1586 first_store = dr;
1589 /* If there are both known and unknown misaligned accesses in the
1590 loop, we choose peeling amount according to the known
1591 accesses. */
1592 if (!supportable_dr_alignment)
1594 dr0 = dr;
1595 if (!first_store && DR_IS_WRITE (dr))
1596 first_store = dr;
1600 else
1602 if (!aligned_access_p (dr))
1604 if (dump_enabled_p ())
1605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1606 "vector alignment may not be reachable\n");
1607 break;
1612 /* Check if we can possibly peel the loop. */
1613 if (!vect_can_advance_ivs_p (loop_vinfo)
1614 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1615 || loop->inner)
1616 do_peeling = false;
1618 if (do_peeling
1619 && all_misalignments_unknown
1620 && vect_supportable_dr_alignment (dr0, false))
1622 /* Check if the target requires us to prefer stores over loads, i.e., if
1623 misaligned stores are more expensive than misaligned loads (taking
1624 drs with same alignment into account). */
1625 if (first_store && DR_IS_READ (dr0))
1627 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1628 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1629 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1630 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1631 stmt_vector_for_cost dummy;
1632 dummy.create (2);
1634 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1635 &dummy);
1636 vect_get_data_access_cost (first_store, &store_inside_cost,
1637 &store_outside_cost, &dummy);
1639 dummy.release ();
1641 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1642 aligning the load DR0). */
1643 load_inside_penalty = store_inside_cost;
1644 load_outside_penalty = store_outside_cost;
1645 for (i = 0;
1646 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1647 DR_STMT (first_store))).iterate (i, &dr);
1648 i++)
1649 if (DR_IS_READ (dr))
1651 load_inside_penalty += load_inside_cost;
1652 load_outside_penalty += load_outside_cost;
1654 else
1656 load_inside_penalty += store_inside_cost;
1657 load_outside_penalty += store_outside_cost;
1660 /* Calculate the penalty for leaving DR0 unaligned (by
1661 aligning the FIRST_STORE). */
1662 store_inside_penalty = load_inside_cost;
1663 store_outside_penalty = load_outside_cost;
1664 for (i = 0;
1665 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1666 DR_STMT (dr0))).iterate (i, &dr);
1667 i++)
1668 if (DR_IS_READ (dr))
1670 store_inside_penalty += load_inside_cost;
1671 store_outside_penalty += load_outside_cost;
1673 else
1675 store_inside_penalty += store_inside_cost;
1676 store_outside_penalty += store_outside_cost;
1679 if (load_inside_penalty > store_inside_penalty
1680 || (load_inside_penalty == store_inside_penalty
1681 && load_outside_penalty > store_outside_penalty))
1682 dr0 = first_store;
1685 /* In case there are only loads with different unknown misalignments, use
1686 peeling only if it may help to align other accesses in the loop or
1687 if it may help improve load bandwidth when we'd end up using
1688 unaligned loads. */
1689 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1690 if (!first_store
1691 && !STMT_VINFO_SAME_ALIGN_REFS (
1692 vinfo_for_stmt (DR_STMT (dr0))).length ()
1693 && (vect_supportable_dr_alignment (dr0, false)
1694 != dr_unaligned_supported
1695 || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1696 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1697 do_peeling = false;
1700 if (do_peeling && !dr0)
1702 /* Peeling is possible, but no data access strictly requires alignment
1703 in order to be supported. So we try to choose the best possible peeling. */
1705 /* We should get here only if there are drs with known misalignment. */
1706 gcc_assert (!all_misalignments_unknown);
1708 /* Choose the best peeling from the hash table. */
1709 dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
1710 loop_vinfo, &npeel,
1711 &body_cost_vec);
1712 if (!dr0 || !npeel)
1713 do_peeling = false;
1716 if (do_peeling)
1718 stmt = DR_STMT (dr0);
1719 stmt_info = vinfo_for_stmt (stmt);
1720 vectype = STMT_VINFO_VECTYPE (stmt_info);
1721 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1723 if (known_alignment_for_access_p (dr0))
1725 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1726 size_zero_node) < 0;
1727 if (!npeel)
1729 /* Since it's known at compile time, compute the number of
1730 iterations in the peeled loop (the peeling factor) for use in
1731 updating DR_MISALIGNMENT values. The peeling factor is the
1732 vectorization factor minus the misalignment as an element
1733 count. */
1734 mis = DR_MISALIGNMENT (dr0);
1735 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1736 npeel = ((negative ? mis - nelements : nelements - mis)
1737 & (nelements - 1));
1740 /* For interleaved data access every iteration accesses all the
1741 members of the group, therefore we divide the number of iterations
1742 by the group size. */
1743 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1744 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1745 npeel /= GROUP_SIZE (stmt_info);
1747 if (dump_enabled_p ())
1748 dump_printf_loc (MSG_NOTE, vect_location,
1749 "Try peeling by %d\n", npeel);
1752 /* Ensure that all data refs can be vectorized after the peel. */
1753 FOR_EACH_VEC_ELT (datarefs, i, dr)
1755 int save_misalignment;
1757 if (dr == dr0)
1758 continue;
1760 stmt = DR_STMT (dr);
1761 stmt_info = vinfo_for_stmt (stmt);
1762 /* For interleaving, only the alignment of the first access
1763 matters. */
1764 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1765 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1766 continue;
1768 /* Strided accesses perform only component accesses, alignment is
1769 irrelevant for them. */
1770 if (STMT_VINFO_STRIDED_P (stmt_info)
1771 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1772 continue;
1774 save_misalignment = DR_MISALIGNMENT (dr);
1775 vect_update_misalignment_for_peel (dr, dr0, npeel);
1776 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1777 SET_DR_MISALIGNMENT (dr, save_misalignment);
1779 if (!supportable_dr_alignment)
1781 do_peeling = false;
1782 break;
1786 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1788 stat = vect_verify_datarefs_alignment (loop_vinfo);
1789 if (!stat)
1790 do_peeling = false;
1791 else
1793 body_cost_vec.release ();
1794 return stat;
1798 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1799 if (do_peeling)
1801 unsigned max_allowed_peel
1802 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1803 if (max_allowed_peel != (unsigned)-1)
1805 unsigned max_peel = npeel;
1806 if (max_peel == 0)
1808 gimple *dr_stmt = DR_STMT (dr0);
1809 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1810 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1811 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1813 if (max_peel > max_allowed_peel)
1815 do_peeling = false;
1816 if (dump_enabled_p ())
1817 dump_printf_loc (MSG_NOTE, vect_location,
1818 "Disable peeling, max peels reached: %d\n", max_peel);
1823 /* Cost model #2 - if peeling may result in a remaining loop not
1824 iterating enough to be vectorized then do not peel. */
1825 if (do_peeling
1826 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1828 unsigned max_peel
1829 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1830 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1831 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1832 do_peeling = false;
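      /* Illustrative worked example (hypothetical numbers): with
	 LOOP_VINFO_INT_NITERS == 10, a vectorization factor of 4 and
	 npeel == 3 the check above is 10 < 4 + 3, which is false, so
	 peeling is kept; with only 6 known iterations the loop left after
	 peeling could not run even one vector iteration, so peeling is
	 disabled.  */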
1835 if (do_peeling)
1837 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1838 If the misalignment of DR_i is identical to that of dr0 then set
1839 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1840 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1841 by the peeling factor times the element size of DR_i (MOD the
1842 vectorization factor times the size). Otherwise, the
1843 misalignment of DR_i must be set to unknown. */
1844 FOR_EACH_VEC_ELT (datarefs, i, dr)
1845 if (dr != dr0)
1847 /* Strided accesses perform only component accesses, alignment
1848 is irrelevant for them. */
1849 stmt_info = vinfo_for_stmt (DR_STMT (dr));
1850 if (STMT_VINFO_STRIDED_P (stmt_info)
1851 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1852 continue;
1854 vect_update_misalignment_for_peel (dr, dr0, npeel);
1857 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1858 if (npeel)
1859 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1860 else
1861 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1862 = DR_MISALIGNMENT (dr0);
1863 SET_DR_MISALIGNMENT (dr0, 0);
1864 if (dump_enabled_p ())
1866 dump_printf_loc (MSG_NOTE, vect_location,
1867 "Alignment of access forced using peeling.\n");
1868 dump_printf_loc (MSG_NOTE, vect_location,
1869 "Peeling for alignment will be applied.\n");
1871 /* The inside-loop cost will be accounted for in vectorizable_load
1872 and vectorizable_store correctly with adjusted alignments.
1873 Drop the body_cost_vec on the floor here. */
1874 body_cost_vec.release ();
1876 stat = vect_verify_datarefs_alignment (loop_vinfo);
1877 gcc_assert (stat);
1878 return stat;
1882 body_cost_vec.release ();
1884 /* (2) Versioning to force alignment. */
1886 /* Try versioning if:
1887 1) the loop is optimized for speed, and
1888 2) there is at least one unsupported misaligned data ref with an unknown
1889 misalignment, and
1890 3) all misaligned data refs with a known misalignment are supported, and
1891 4) the number of runtime alignment checks is within reason. */
1893 do_versioning =
1894 optimize_loop_nest_for_speed_p (loop)
1895 && (!loop->inner); /* FORNOW */
1897 if (do_versioning)
1899 FOR_EACH_VEC_ELT (datarefs, i, dr)
1901 stmt = DR_STMT (dr);
1902 stmt_info = vinfo_for_stmt (stmt);
1904 /* For interleaving, only the alignment of the first access
1905 matters. */
1906 if (aligned_access_p (dr)
1907 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1908 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1909 continue;
1911 if (STMT_VINFO_STRIDED_P (stmt_info))
1913 /* Strided loads perform only component accesses, alignment is
1914 irrelevant for them. */
1915 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1916 continue;
1917 do_versioning = false;
1918 break;
1921 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1923 if (!supportable_dr_alignment)
1925 gimple *stmt;
1926 int mask;
1927 tree vectype;
1929 if (known_alignment_for_access_p (dr)
1930 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1931 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1933 do_versioning = false;
1934 break;
1937 stmt = DR_STMT (dr);
1938 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1939 gcc_assert (vectype);
1941 /* The rightmost bits of an aligned address must be zeros.
1942 Construct the mask needed for this test. For example,
1943 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1944 mask must be 15 = 0xf. */
1945 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
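	     /* Conceptually, the versioned runtime test built later from
		LOOP_VINFO_MAY_MISALIGN_STMTS checks (sketch only)

		  ((ADDR_1 | ADDR_2 | ... | ADDR_n) & mask) == 0

		so that, e.g. for V4SI with mask == 0xf, the vector copy of
		the loop is entered only when every recorded address is
		16-byte aligned.  */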
1947 /* FORNOW: use the same mask to test all potentially unaligned
1948 references in the loop. The vectorizer currently supports
1949 a single vector size, see the reference to
1950 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1951 vectorization factor is computed. */
1952 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1953 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1954 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1955 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1956 DR_STMT (dr));
1960 /* Versioning requires at least one misaligned data reference. */
1961 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1962 do_versioning = false;
1963 else if (!do_versioning)
1964 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1967 if (do_versioning)
1969 vec<gimple *> may_misalign_stmts
1970 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1971 gimple *stmt;
1973 /* It can now be assumed that the data references in the statements
1974 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1975 of the loop being vectorized. */
1976 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1978 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1979 dr = STMT_VINFO_DATA_REF (stmt_info);
1980 SET_DR_MISALIGNMENT (dr, 0);
1981 if (dump_enabled_p ())
1982 dump_printf_loc (MSG_NOTE, vect_location,
1983 "Alignment of access forced using versioning.\n");
1986 if (dump_enabled_p ())
1987 dump_printf_loc (MSG_NOTE, vect_location,
1988 "Versioning for alignment will be applied.\n");
1990 /* Peeling and versioning can't be done together at this time. */
1991 gcc_assert (! (do_peeling && do_versioning));
1993 stat = vect_verify_datarefs_alignment (loop_vinfo);
1994 gcc_assert (stat);
1995 return stat;
1998 /* This point is reached if neither peeling nor versioning is being done. */
1999 gcc_assert (! (do_peeling || do_versioning));
2001 stat = vect_verify_datarefs_alignment (loop_vinfo);
2002 return stat;
2006 /* Function vect_find_same_alignment_drs.
2008 Update group and alignment relations according to the chosen
2009 vectorization factor. */
2011 static void
2012 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
2013 loop_vec_info loop_vinfo)
2015 unsigned int i;
2016 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2017 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2018 struct data_reference *dra = DDR_A (ddr);
2019 struct data_reference *drb = DDR_B (ddr);
2020 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2021 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2022 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
2023 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
2024 lambda_vector dist_v;
2025 unsigned int loop_depth;
2027 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2028 return;
2030 if (dra == drb)
2031 return;
2033 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
2034 return;
2036 /* Loop-based vectorization and known data dependence. */
2037 if (DDR_NUM_DIST_VECTS (ddr) == 0)
2038 return;
2040 /* Data-dependence analysis reports a distance vector of zero
2041 for data-references that overlap only in the first iteration
2042 but have steps of different sign (see PR45764).
2043 So as a sanity check require equal DR_STEP. */
2044 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2045 return;
2047 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
2048 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
2050 int dist = dist_v[loop_depth];
2052 if (dump_enabled_p ())
2053 dump_printf_loc (MSG_NOTE, vect_location,
2054 "dependence distance = %d.\n", dist);
2056 /* Same loop iteration. */
2057 if (dist == 0
2058 || (dist % vectorization_factor == 0 && dra_size == drb_size))
2060 /* Two references with distance zero have the same alignment. */
2061 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2062 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2063 if (dump_enabled_p ())
2065 dump_printf_loc (MSG_NOTE, vect_location,
2066 "accesses have the same alignment.\n");
2067 dump_printf (MSG_NOTE,
2068 "dependence distance modulo vf == 0 between ");
2069 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2070 dump_printf (MSG_NOTE, " and ");
2071 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2072 dump_printf (MSG_NOTE, "\n");
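  /* Illustrative example (hypothetical loop): in

       for (i = 0; i < n; i++)
	 a[i] = a[i + 8] + 1;

     with 4-byte elements and a vectorization factor of 8 the dependence
     distance is 8, so dist % vf == 0 and both refs have the same element
     size; the two accesses are therefore recorded as having the same
     alignment.  */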
2079 /* Function vect_analyze_data_refs_alignment
2081 Analyze the alignment of the data-references in the loop.
2082 Return FALSE if a data reference is found that cannot be vectorized. */
2084 bool
2085 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2087 if (dump_enabled_p ())
2088 dump_printf_loc (MSG_NOTE, vect_location,
2089 "=== vect_analyze_data_refs_alignment ===\n");
2091 /* Mark groups of data references with same alignment using
2092 data dependence information. */
2093 vec<ddr_p> ddrs = vinfo->ddrs;
2094 struct data_dependence_relation *ddr;
2095 unsigned int i;
2097 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2098 vect_find_same_alignment_drs (ddr, vinfo);
2100 vec<data_reference_p> datarefs = vinfo->datarefs;
2101 struct data_reference *dr;
2103 FOR_EACH_VEC_ELT (datarefs, i, dr)
2105 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2106 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2107 && !vect_compute_data_ref_alignment (dr))
2109 /* Strided accesses perform only component accesses, misalignment
2110 information is irrelevant for them. */
2111 if (STMT_VINFO_STRIDED_P (stmt_info)
2112 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2113 continue;
2115 if (dump_enabled_p ())
2116 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2117 "not vectorized: can't calculate alignment "
2118 "for data ref.\n");
2120 return false;
2124 return true;
2128 /* Analyze alignment of DRs of stmts in NODE. */
2130 static bool
2131 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2133 /* We vectorize from the first scalar stmt in the node unless
2134 the node is permuted in which case we start from the first
2135 element in the group. */
2136 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2137 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2138 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2139 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2141 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2142 if (! vect_compute_data_ref_alignment (dr)
2143 /* For creating the data-ref pointer we need alignment of the
2144 first element anyway. */
2145 || (dr != first_dr
2146 && ! vect_compute_data_ref_alignment (first_dr))
2147 || ! verify_data_ref_alignment (dr))
2149 if (dump_enabled_p ())
2150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2151 "not vectorized: bad data alignment in basic "
2152 "block.\n");
2153 return false;
2156 return true;
2159 /* Function vect_slp_analyze_and_verify_instance_alignment
2161 Analyze the alignment of the data-references in the SLP instance.
2162 Return FALSE if a data reference is found that cannot be vectorized. */
2164 bool
2165 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2167 if (dump_enabled_p ())
2168 dump_printf_loc (MSG_NOTE, vect_location,
2169 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2171 slp_tree node;
2172 unsigned i;
2173 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2174 if (! vect_slp_analyze_and_verify_node_alignment (node))
2175 return false;
2177 node = SLP_INSTANCE_TREE (instance);
2178 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2179 && ! vect_slp_analyze_and_verify_node_alignment
2180 (SLP_INSTANCE_TREE (instance)))
2181 return false;
2183 return true;
2187 /* Analyze groups of accesses: check that DR belongs to a group of
2188 accesses of legal size, step, etc. Detect gaps, single element
2189 interleaving, and other special cases. Set grouped access info.
2190 Collect groups of strided stores for further use in SLP analysis.
2191 Worker for vect_analyze_group_access. */
2193 static bool
2194 vect_analyze_group_access_1 (struct data_reference *dr)
2196 tree step = DR_STEP (dr);
2197 tree scalar_type = TREE_TYPE (DR_REF (dr));
2198 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2199 gimple *stmt = DR_STMT (dr);
2200 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2201 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2202 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2203 HOST_WIDE_INT dr_step = -1;
2204 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2205 bool slp_impossible = false;
2207 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2208 size of the interleaving group (including gaps). */
2209 if (tree_fits_shwi_p (step))
2211 dr_step = tree_to_shwi (step);
2212 /* Check that STEP is a multiple of type size. Otherwise there is
2213 a non-element-sized gap at the end of the group which we
2214 cannot represent in GROUP_GAP or GROUP_SIZE.
2215 ??? As we can handle non-constant step fine here we should
2216 simply remove uses of GROUP_GAP between the last and first
2217 element and instead rely on DR_STEP. GROUP_SIZE then would
2218 simply not include that gap. */
2219 if ((dr_step % type_size) != 0)
2221 if (dump_enabled_p ())
2223 dump_printf_loc (MSG_NOTE, vect_location,
2224 "Step ");
2225 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2226 dump_printf (MSG_NOTE,
2227 " is not a multiple of the element size for ");
2228 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2229 dump_printf (MSG_NOTE, "\n");
2231 return false;
2233 groupsize = absu_hwi (dr_step) / type_size;
2235 else
2236 groupsize = 0;
2238 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2239 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2241 /* Check if this DR is part of an interleaving group, and is a single
2242 element of the group that is accessed in the loop. */
2244 /* Gaps are supported only for loads. STEP must be a multiple of the type
2245 size. The size of the group must be a power of 2. */
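      /* Illustrative example (hypothetical loop): a read such as

	   for (i = 0; i < n; i++)
	     sum += a[2 * i];

	 touches only one element of each two-element "group"; with 4-byte
	 elements the step is 8, groupsize == 2 is a power of two and the
	 access is a load, so it is accepted below as single element
	 interleaving with a gap.  */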
2246 if (DR_IS_READ (dr)
2247 && (dr_step % type_size) == 0
2248 && groupsize > 0
2249 && exact_log2 (groupsize) != -1)
2251 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2252 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2253 if (dump_enabled_p ())
2255 dump_printf_loc (MSG_NOTE, vect_location,
2256 "Detected single element interleaving ");
2257 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2258 dump_printf (MSG_NOTE, " step ");
2259 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2260 dump_printf (MSG_NOTE, "\n");
2263 return true;
2266 if (dump_enabled_p ())
2268 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2269 "not consecutive access ");
2270 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2273 if (bb_vinfo)
2275 /* Mark the statement as unvectorizable. */
2276 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2277 return true;
2280 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2281 STMT_VINFO_STRIDED_P (stmt_info) = true;
2282 return true;
2285 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2287 /* First stmt in the interleaving chain. Check the chain. */
2288 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2289 struct data_reference *data_ref = dr;
2290 unsigned int count = 1;
2291 tree prev_init = DR_INIT (data_ref);
2292 gimple *prev = stmt;
2293 HOST_WIDE_INT diff, gaps = 0;
2295 while (next)
2297 /* Skip same data-refs. If two or more stmts share a
2298 data-ref (supported only for loads), we vectorize only the first
2299 stmt, and the rest get their vectorized loads from the first
2300 one. */
2301 if (!tree_int_cst_compare (DR_INIT (data_ref),
2302 DR_INIT (STMT_VINFO_DATA_REF (
2303 vinfo_for_stmt (next)))))
2305 if (DR_IS_WRITE (data_ref))
2307 if (dump_enabled_p ())
2308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2309 "Two store stmts share the same dr.\n");
2310 return false;
2313 if (dump_enabled_p ())
2314 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2315 "Two or more load stmts share the same dr.\n");
2317 /* For load use the same data-ref load. */
2318 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2320 prev = next;
2321 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2322 continue;
2325 prev = next;
2326 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2328 /* All group members have the same STEP by construction. */
2329 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2331 /* Check that the distance between two accesses is equal to the type
2332 size. Otherwise, we have gaps. */
2333 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2334 - TREE_INT_CST_LOW (prev_init)) / type_size;
2335 if (diff != 1)
2337 /* FORNOW: SLP of accesses with gaps is not supported. */
2338 slp_impossible = true;
2339 if (DR_IS_WRITE (data_ref))
2341 if (dump_enabled_p ())
2342 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2343 "interleaved store with gaps\n");
2344 return false;
2347 gaps += diff - 1;
2350 last_accessed_element += diff;
2352 /* Store the gap from the previous member of the group. If there is no
2353 gap in the access, GROUP_GAP is always 1. */
2354 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2356 prev_init = DR_INIT (data_ref);
2357 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2358 /* Count the number of data-refs in the chain. */
2359 count++;
2362 if (groupsize == 0)
2363 groupsize = count + gaps;
2365 if (groupsize > UINT_MAX)
2367 if (dump_enabled_p ())
2368 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2369 "group is too large\n");
2370 return false;
2373 /* Check that the size of the interleaving is equal to count for stores,
2374 i.e., that there are no gaps. */
2375 if (groupsize != count
2376 && !DR_IS_READ (dr))
2378 if (dump_enabled_p ())
2379 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2380 "interleaved store with gaps\n");
2381 return false;
2384 /* If there is a gap after the last load in the group it is the
2385 difference between the groupsize and the last accessed
2386 element.
2387 When there is no gap, this difference should be 0. */
2388 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
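      /* Illustrative worked example (hypothetical loop): for loads of
	 a[3*i] and a[3*i + 1] the step gives groupsize == 3 while the last
	 accessed element is 2, so GROUP_GAP of the first stmt becomes
	 3 - 2 == 1, recording one unused element after the group in every
	 iteration.  */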
2390 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2391 if (dump_enabled_p ())
2393 dump_printf_loc (MSG_NOTE, vect_location,
2394 "Detected interleaving ");
2395 if (DR_IS_READ (dr))
2396 dump_printf (MSG_NOTE, "load ");
2397 else
2398 dump_printf (MSG_NOTE, "store ");
2399 dump_printf (MSG_NOTE, "of size %u starting with ",
2400 (unsigned)groupsize);
2401 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2402 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2403 dump_printf_loc (MSG_NOTE, vect_location,
2404 "There is a gap of %u elements after the group\n",
2405 GROUP_GAP (vinfo_for_stmt (stmt)));
2408 /* SLP: create an SLP data structure for every interleaving group of
2409 stores for further analysis in vect_analyze_slp. */
2410 if (DR_IS_WRITE (dr) && !slp_impossible)
2412 if (loop_vinfo)
2413 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2414 if (bb_vinfo)
2415 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2419 return true;
2422 /* Analyze groups of accesses: check that DR belongs to a group of
2423 accesses of legal size, step, etc. Detect gaps, single element
2424 interleaving, and other special cases. Set grouped access info.
2425 Collect groups of strided stores for further use in SLP analysis. */
2427 static bool
2428 vect_analyze_group_access (struct data_reference *dr)
2430 if (!vect_analyze_group_access_1 (dr))
2432 /* Dissolve the group if present. */
2433 gimple *next;
2434 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2435 while (stmt)
2437 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2438 next = GROUP_NEXT_ELEMENT (vinfo);
2439 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2440 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2441 stmt = next;
2443 return false;
2445 return true;
2448 /* Analyze the access pattern of the data-reference DR.
2449 In case of non-consecutive accesses call vect_analyze_group_access() to
2450 analyze groups of accesses. */
2452 static bool
2453 vect_analyze_data_ref_access (struct data_reference *dr)
2455 tree step = DR_STEP (dr);
2456 tree scalar_type = TREE_TYPE (DR_REF (dr));
2457 gimple *stmt = DR_STMT (dr);
2458 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2459 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2460 struct loop *loop = NULL;
2462 if (loop_vinfo)
2463 loop = LOOP_VINFO_LOOP (loop_vinfo);
2465 if (loop_vinfo && !step)
2467 if (dump_enabled_p ())
2468 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2469 "bad data-ref access in loop\n");
2470 return false;
2473 /* Allow loads with zero step in inner-loop vectorization. */
2474 if (loop_vinfo && integer_zerop (step))
2476 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2477 if (!nested_in_vect_loop_p (loop, stmt))
2478 return DR_IS_READ (dr);
2479 /* Allow references with zero step for outer loops marked
2480 with pragma omp simd only - it guarantees absence of
2481 loop-carried dependencies between inner loop iterations. */
2482 if (!loop->force_vectorize)
2484 if (dump_enabled_p ())
2485 dump_printf_loc (MSG_NOTE, vect_location,
2486 "zero step in inner loop of nest\n");
2487 return false;
2491 if (loop && nested_in_vect_loop_p (loop, stmt))
2493 /* Interleaved accesses are not yet supported within outer-loop
2494 vectorization for references in the inner-loop. */
2495 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2497 /* For the rest of the analysis we use the outer-loop step. */
2498 step = STMT_VINFO_DR_STEP (stmt_info);
2499 if (integer_zerop (step))
2501 if (dump_enabled_p ())
2502 dump_printf_loc (MSG_NOTE, vect_location,
2503 "zero step in outer loop.\n");
2504 return DR_IS_READ (dr);
2508 /* Consecutive? */
2509 if (TREE_CODE (step) == INTEGER_CST)
2511 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2512 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2513 || (dr_step < 0
2514 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2516 /* Mark that it is not interleaving. */
2517 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2518 return true;
2522 if (loop && nested_in_vect_loop_p (loop, stmt))
2524 if (dump_enabled_p ())
2525 dump_printf_loc (MSG_NOTE, vect_location,
2526 "grouped access in outer loop.\n");
2527 return false;
2531 /* Assume this is a DR handled by non-constant strided load case. */
2532 if (TREE_CODE (step) != INTEGER_CST)
2533 return (STMT_VINFO_STRIDED_P (stmt_info)
2534 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2535 || vect_analyze_group_access (dr)));
2537 /* Not consecutive access - check if it's a part of interleaving group. */
2538 return vect_analyze_group_access (dr);
2543 /* A helper function used in the comparator function to sort data
2544 references. T1 and T2 are the two trees to be compared.
2545 The function returns -1, 0, or 1. */
2547 static int
2548 compare_tree (tree t1, tree t2)
2550 int i, cmp;
2551 enum tree_code code;
2552 char tclass;
2554 if (t1 == t2)
2555 return 0;
2556 if (t1 == NULL)
2557 return -1;
2558 if (t2 == NULL)
2559 return 1;
2561 STRIP_NOPS (t1);
2562 STRIP_NOPS (t2);
2564 if (TREE_CODE (t1) != TREE_CODE (t2))
2565 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2567 code = TREE_CODE (t1);
2568 switch (code)
2570 /* For const values, we can just use hash values for comparisons. */
2571 case INTEGER_CST:
2572 case REAL_CST:
2573 case FIXED_CST:
2574 case STRING_CST:
2575 case COMPLEX_CST:
2576 case VECTOR_CST:
2578 hashval_t h1 = iterative_hash_expr (t1, 0);
2579 hashval_t h2 = iterative_hash_expr (t2, 0);
2580 if (h1 != h2)
2581 return h1 < h2 ? -1 : 1;
2582 break;
2585 case SSA_NAME:
2586 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2587 if (cmp != 0)
2588 return cmp;
2590 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2591 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2592 break;
2594 default:
2595 tclass = TREE_CODE_CLASS (code);
2597 /* For var-decl, we could compare their UIDs. */
2598 if (tclass == tcc_declaration)
2600 if (DECL_UID (t1) != DECL_UID (t2))
2601 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2602 break;
2605 /* For expressions with operands, compare their operands recursively. */
2606 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2608 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2609 if (cmp != 0)
2610 return cmp;
2614 return 0;
2618 /* Compare two data-references DRA and DRB to group them into chunks
2619 suitable for grouping. */
2621 static int
2622 dr_group_sort_cmp (const void *dra_, const void *drb_)
2624 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2625 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2626 int cmp;
2628 /* Stabilize sort. */
2629 if (dra == drb)
2630 return 0;
2632 /* DRs in different loops never belong to the same group. */
2633 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2634 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2635 if (loopa != loopb)
2636 return loopa->num < loopb->num ? -1 : 1;
2638 /* Ordering of DRs according to base. */
2639 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2641 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2642 if (cmp != 0)
2643 return cmp;
2646 /* And according to DR_OFFSET. */
2647 if (!dr_equal_offsets_p (dra, drb))
2649 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2650 if (cmp != 0)
2651 return cmp;
2654 /* Put reads before writes. */
2655 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2656 return DR_IS_READ (dra) ? -1 : 1;
2658 /* Then sort after access size. */
2659 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2660 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2662 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2663 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2664 if (cmp != 0)
2665 return cmp;
2668 /* And after step. */
2669 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2671 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2672 if (cmp != 0)
2673 return cmp;
2676 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2677 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2678 if (cmp == 0)
2679 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2680 return cmp;
2683 /* Function vect_analyze_data_ref_accesses.
2685 Analyze the access pattern of all the data references in the loop.
2687 FORNOW: the only access pattern that is considered vectorizable is a
2688 simple step 1 (consecutive) access.
2690 FORNOW: handle only arrays and pointer accesses. */
2692 bool
2693 vect_analyze_data_ref_accesses (vec_info *vinfo)
2695 unsigned int i;
2696 vec<data_reference_p> datarefs = vinfo->datarefs;
2697 struct data_reference *dr;
2699 if (dump_enabled_p ())
2700 dump_printf_loc (MSG_NOTE, vect_location,
2701 "=== vect_analyze_data_ref_accesses ===\n");
2703 if (datarefs.is_empty ())
2704 return true;
2706 /* Sort the array of datarefs to make building the interleaving chains
2707 linear. Don't modify the original vector's order; it is needed for
2708 determining what dependencies are reversed. */
2709 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2710 datarefs_copy.qsort (dr_group_sort_cmp);
2712 /* Build the interleaving chains. */
2713 for (i = 0; i < datarefs_copy.length () - 1;)
2715 data_reference_p dra = datarefs_copy[i];
2716 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2717 stmt_vec_info lastinfo = NULL;
2718 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2720 ++i;
2721 continue;
2723 for (i = i + 1; i < datarefs_copy.length (); ++i)
2725 data_reference_p drb = datarefs_copy[i];
2726 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2727 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2728 break;
2730 /* ??? Imperfect sorting (non-compatible types, non-modulo
2731 accesses, same accesses) can lead to a group being artificially
2732 split here as we don't just skip over those. If it really
2733 matters we can push those to a worklist and re-iterate
2734 over them. Then we can just skip ahead to the next DR here. */
2736 /* DRs in a different loop should not be put into the same
2737 interleaving group. */
2738 if (gimple_bb (DR_STMT (dra))->loop_father
2739 != gimple_bb (DR_STMT (drb))->loop_father)
2740 break;
2742 /* Check that the data-refs have same first location (except init)
2743 and they are both either store or load (not load and store,
2744 not masked loads or stores). */
2745 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2746 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2747 DR_BASE_ADDRESS (drb), 0)
2748 || !dr_equal_offsets_p (dra, drb)
2749 || !gimple_assign_single_p (DR_STMT (dra))
2750 || !gimple_assign_single_p (DR_STMT (drb)))
2751 break;
2753 /* Check that the data-refs have the same constant size. */
2754 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2755 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2756 if (!tree_fits_uhwi_p (sza)
2757 || !tree_fits_uhwi_p (szb)
2758 || !tree_int_cst_equal (sza, szb))
2759 break;
2761 /* Check that the data-refs have the same step. */
2762 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2763 break;
2765 /* Do not place the same access in the interleaving chain twice. */
2766 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2767 break;
2769 /* Check the types are compatible.
2770 ??? We don't distinguish this during sorting. */
2771 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2772 TREE_TYPE (DR_REF (drb))))
2773 break;
2775 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2776 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2777 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2778 gcc_assert (init_a <= init_b);
2780 /* If init_b == init_a + the size of the type * k, we have an
2781 interleaving, and DRA is accessed before DRB. */
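	  /* Illustrative example (hypothetical loop): for the stores
	     p[2*i] = x; p[2*i + 1] = y; both data-refs share base, step and
	     size, DR_INIT is 0 resp. 4 with 4-byte elements, so
	     init_b - init_a == 4 is a multiple of the type size and the two
	     refs are linked into one interleaving chain below.  */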
2782 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2783 if (type_size_a == 0
2784 || (init_b - init_a) % type_size_a != 0)
2785 break;
2787 /* If we have a store, the accesses are adjacent. This splits
2788 groups into chunks we support (we don't support vectorization
2789 of stores with gaps). */
2790 if (!DR_IS_READ (dra)
2791 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2792 (DR_INIT (datarefs_copy[i-1]))
2793 != type_size_a))
2794 break;
2796 /* If the step (when constant and non-zero) is greater than the
2797 difference between data-refs' inits this splits groups into
2798 suitable sizes. */
2799 if (tree_fits_shwi_p (DR_STEP (dra)))
2801 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2802 if (step != 0 && step <= (init_b - init_a))
2803 break;
2806 if (dump_enabled_p ())
2808 dump_printf_loc (MSG_NOTE, vect_location,
2809 "Detected interleaving ");
2810 if (DR_IS_READ (dra))
2811 dump_printf (MSG_NOTE, "load ");
2812 else
2813 dump_printf (MSG_NOTE, "store ");
2814 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2815 dump_printf (MSG_NOTE, " and ");
2816 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2817 dump_printf (MSG_NOTE, "\n");
2820 /* Link the found element into the group list. */
2821 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2823 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2824 lastinfo = stmtinfo_a;
2826 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2827 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2828 lastinfo = stmtinfo_b;
2832 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2833 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2834 && !vect_analyze_data_ref_access (dr))
2836 if (dump_enabled_p ())
2837 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2838 "not vectorized: complicated access pattern.\n");
2840 if (is_a <bb_vec_info> (vinfo))
2842 /* Mark the statement as not vectorizable. */
2843 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2844 continue;
2846 else
2848 datarefs_copy.release ();
2849 return false;
2853 datarefs_copy.release ();
2854 return true;
2858 /* Operator == between two dr_with_seg_len objects.
2860 This equality operator is used to make sure two data refs
2861 are the same, so that we will consider combining the
2862 aliasing checks of those two pairs of data dependent data
2863 refs. */
2865 static bool
2866 operator == (const dr_with_seg_len& d1,
2867 const dr_with_seg_len& d2)
2869 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2870 DR_BASE_ADDRESS (d2.dr), 0)
2871 && compare_tree (d1.offset, d2.offset) == 0
2872 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2875 /* Function comp_dr_with_seg_len_pair.
2877 Comparison function for sorting objects of dr_with_seg_len_pair_t
2878 so that we can combine aliasing checks in one scan. */
2880 static int
2881 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2883 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2884 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2886 const dr_with_seg_len &p11 = p1->first,
2887 &p12 = p1->second,
2888 &p21 = p2->first,
2889 &p22 = p2->second;
2891 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2892 if a and c have the same base address and step, and b and d have the same
2893 base address and step. Therefore, if either a&c or b&d do not have the same
2894 address and step, we do not care about the order of those two pairs after sorting. */
2895 int comp_res;
2897 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2898 DR_BASE_ADDRESS (p21.dr))) != 0)
2899 return comp_res;
2900 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2901 DR_BASE_ADDRESS (p22.dr))) != 0)
2902 return comp_res;
2903 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2904 return comp_res;
2905 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2906 return comp_res;
2907 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2908 return comp_res;
2909 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2910 return comp_res;
2912 return 0;
2915 /* Function vect_vfa_segment_size.
2917 Create an expression that computes the size of the segment
2918 that will be accessed for a data reference. The function takes into
2919 account that realignment loads may access one more vector.
2921 Input:
2922 DR: The data reference.
2923 LENGTH_FACTOR: segment length to consider.
2925 Return an expression whose value is the size of segment which will be
2926 accessed by DR. */
2928 static tree
2929 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2931 tree segment_length;
2933 if (integer_zerop (DR_STEP (dr)))
2934 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2935 else
2936 segment_length = size_binop (MULT_EXPR,
2937 fold_convert (sizetype, DR_STEP (dr)),
2938 fold_convert (sizetype, length_factor));
2940 if (vect_supportable_dr_alignment (dr, false)
2941 == dr_explicit_realign_optimized)
2943 tree vector_size = TYPE_SIZE_UNIT
2944 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2946 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2948 return segment_length;
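/* Illustrative worked example (hypothetical numbers): for a data-ref with
   DR_STEP == 4 and LENGTH_FACTOR == 8 (the vectorization factor) the
   segment length is 32 bytes; if the ref also uses the
   dr_explicit_realign_optimized scheme, one extra vector (16 bytes for
   V4SI) is added, giving 48.  */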
2951 /* Function vect_prune_runtime_alias_test_list.
2953 Prune a list of ddrs to be tested at run-time by versioning for alias.
2954 Merge several alias checks into one if possible.
2955 Return FALSE if the resulting list of ddrs is longer than allowed by
2956 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2958 bool
2959 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2961 vec<ddr_p> may_alias_ddrs =
2962 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2963 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2964 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2965 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2966 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2968 ddr_p ddr;
2969 unsigned int i;
2970 tree length_factor;
2972 if (dump_enabled_p ())
2973 dump_printf_loc (MSG_NOTE, vect_location,
2974 "=== vect_prune_runtime_alias_test_list ===\n");
2976 if (may_alias_ddrs.is_empty ())
2977 return true;
2979 /* Basically, for each pair of dependent data refs store_ptr_0
2980 and load_ptr_0, we create an expression:
2982 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2983 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2985 for aliasing checks. However, in some cases we can decrease
2986 the number of checks by combining two checks into one. For
2987 example, suppose we have another pair of data refs store_ptr_0
2988 and load_ptr_1, and if the following condition is satisfied:
2990 load_ptr_0 < load_ptr_1 &&
2991 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2993 (this condition means, in each iteration of the vectorized loop,
2994 the accessed memory of store_ptr_0 cannot be between the memory
2995 of load_ptr_0 and load_ptr_1.)
2997 we then can use only the following expression to finish the
2998 aliasing checks between store_ptr_0 & load_ptr_0 and
2999 store_ptr_0 & load_ptr_1:
3001 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
3002 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
3004 Note that we only consider that load_ptr_0 and load_ptr_1 have the
3005 same basic address. */
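  /* Illustrative example (hypothetical loop, assuming p and q are pointer
     parameters that may alias): in

       for (i = 0; i < n; i++)
	 q[i] = p[i] + p[i + 1];

     the pairs (q, p) and (q, p + 1) would each need a runtime overlap
     test; because both loads share the same base address the two tests
     are merged below into a single check against a slightly longer
     segment starting at p.  */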
3007 comp_alias_ddrs.create (may_alias_ddrs.length ());
3009 /* First, we collect all data ref pairs for aliasing checks. */
3010 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3012 struct data_reference *dr_a, *dr_b;
3013 gimple *dr_group_first_a, *dr_group_first_b;
3014 tree segment_length_a, segment_length_b;
3015 gimple *stmt_a, *stmt_b;
3017 dr_a = DDR_A (ddr);
3018 stmt_a = DR_STMT (DDR_A (ddr));
3019 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3020 if (dr_group_first_a)
3022 stmt_a = dr_group_first_a;
3023 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3026 dr_b = DDR_B (ddr);
3027 stmt_b = DR_STMT (DDR_B (ddr));
3028 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3029 if (dr_group_first_b)
3031 stmt_b = dr_group_first_b;
3032 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3035 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3036 length_factor = scalar_loop_iters;
3037 else
3038 length_factor = size_int (vect_factor);
3039 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3040 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3042 dr_with_seg_len_pair_t dr_with_seg_len_pair
3043 (dr_with_seg_len (dr_a, segment_length_a),
3044 dr_with_seg_len (dr_b, segment_length_b));
3046 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
3047 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3049 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3052 /* Second, we sort the collected data ref pairs so that we can scan
3053 them once to combine all possible aliasing checks. */
3054 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
3056 /* Third, we scan the sorted dr pairs and check if we can combine
3057 alias checks of two neighboring dr pairs. */
3058 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
3060 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
3061 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
3062 *dr_b1 = &comp_alias_ddrs[i-1].second,
3063 *dr_a2 = &comp_alias_ddrs[i].first,
3064 *dr_b2 = &comp_alias_ddrs[i].second;
3066 /* Remove duplicate data ref pairs. */
3067 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
3069 if (dump_enabled_p ())
3071 dump_printf_loc (MSG_NOTE, vect_location,
3072 "found equal ranges ");
3073 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3074 DR_REF (dr_a1->dr));
3075 dump_printf (MSG_NOTE, ", ");
3076 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3077 DR_REF (dr_b1->dr));
3078 dump_printf (MSG_NOTE, " and ");
3079 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3080 DR_REF (dr_a2->dr));
3081 dump_printf (MSG_NOTE, ", ");
3082 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3083 DR_REF (dr_b2->dr));
3084 dump_printf (MSG_NOTE, "\n");
3087 comp_alias_ddrs.ordered_remove (i--);
3088 continue;
3091 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
3093 /* We consider the case that DR_B1 and DR_B2 are the same memref,
3094 and DR_A1 and DR_A2 are two consecutive memrefs. */
3095 if (*dr_a1 == *dr_a2)
3097 std::swap (dr_a1, dr_b1);
3098 std::swap (dr_a2, dr_b2);
3101 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
3102 DR_BASE_ADDRESS (dr_a2->dr),
3104 || !tree_fits_shwi_p (dr_a1->offset)
3105 || !tree_fits_shwi_p (dr_a2->offset))
3106 continue;
3108 /* Make sure dr_a1 starts left of dr_a2. */
3109 if (tree_int_cst_lt (dr_a2->offset, dr_a1->offset))
3110 std::swap (*dr_a1, *dr_a2);
3112 unsigned HOST_WIDE_INT diff
3113 = tree_to_shwi (dr_a2->offset) - tree_to_shwi (dr_a1->offset);
3116 bool do_remove = false;
3118 /* If the left segment does not extend beyond the start of the
3119 right segment the new segment length is that of the right
3120 plus the segment distance. */
3121 if (tree_fits_uhwi_p (dr_a1->seg_len)
3122 && compare_tree_int (dr_a1->seg_len, diff) <= 0)
3124 dr_a1->seg_len = size_binop (PLUS_EXPR, dr_a2->seg_len,
3125 size_int (diff));
3126 do_remove = true;
3128 /* Generally the new segment length is the maximum of the
3129 left segment size and the right segment size plus the distance.
3130 ??? We can also build tree MAX_EXPR here but it's not clear this
3131 is profitable. */
3132 else if (tree_fits_uhwi_p (dr_a1->seg_len)
3133 && tree_fits_uhwi_p (dr_a2->seg_len))
3135 unsigned HOST_WIDE_INT seg_len_a1 = tree_to_uhwi (dr_a1->seg_len);
3136 unsigned HOST_WIDE_INT seg_len_a2 = tree_to_uhwi (dr_a2->seg_len);
3137 dr_a1->seg_len = size_int (MAX (seg_len_a1, diff + seg_len_a2));
3138 do_remove = true;
3140 /* Now we check if the following condition is satisfied:
3142 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
3144 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
3145 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
3146 have to make a best estimation. We can get the minimum value
3147 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
3148 then either of the following two conditions can guarantee the
3149 one above:
3151 1: DIFF <= MIN_SEG_LEN_B
3152 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B */
3153 else
3155 unsigned HOST_WIDE_INT min_seg_len_b
3156 = (tree_fits_uhwi_p (dr_b1->seg_len)
3157 ? tree_to_uhwi (dr_b1->seg_len)
3158 : vect_factor);
3160 if (diff <= min_seg_len_b
3161 || (tree_fits_uhwi_p (dr_a1->seg_len)
3162 && diff - tree_to_uhwi (dr_a1->seg_len) < min_seg_len_b))
3164 dr_a1->seg_len = size_binop (PLUS_EXPR,
3165 dr_a2->seg_len, size_int (diff));
3166 do_remove = true;
3170 if (do_remove)
3172 if (dump_enabled_p ())
3174 dump_printf_loc (MSG_NOTE, vect_location,
3175 "merging ranges for ");
3176 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1->dr));
3177 dump_printf (MSG_NOTE, ", ");
3178 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1->dr));
3179 dump_printf (MSG_NOTE, " and ");
3180 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2->dr));
3181 dump_printf (MSG_NOTE, ", ");
3182 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2->dr));
3183 dump_printf (MSG_NOTE, "\n");
3185 comp_alias_ddrs.ordered_remove (i--);
3190 dump_printf_loc (MSG_NOTE, vect_location,
3191 "improved number of alias checks from %d to %d\n",
3192 may_alias_ddrs.length (), comp_alias_ddrs.length ());
3193 if ((int) comp_alias_ddrs.length () >
3194 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3195 return false;
3197 return true;
3200 /* Check whether a non-affine read or write in stmt is suitable for gather load
3201 or scatter store and if so, return a builtin decl for that operation. */
3203 tree
3204 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo, tree *basep,
3205 tree *offp, int *scalep)
3207 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3208 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3209 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3210 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3211 tree offtype = NULL_TREE;
3212 tree decl, base, off;
3213 machine_mode pmode;
3214 int punsignedp, reversep, pvolatilep = 0;
3216 base = DR_REF (dr);
3217 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3218 see if we can use the def stmt of the address. */
3219 if (is_gimple_call (stmt)
3220 && gimple_call_internal_p (stmt)
3221 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3222 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3223 && TREE_CODE (base) == MEM_REF
3224 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3225 && integer_zerop (TREE_OPERAND (base, 1))
3226 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3228 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3229 if (is_gimple_assign (def_stmt)
3230 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3231 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3234 /* The gather and scatter builtins need address of the form
3235 loop_invariant + vector * {1, 2, 4, 8}
3237 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3238 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3239 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3240 multiplications and additions in it. To get a vector, we need
3241 a single SSA_NAME that will be defined in the loop and will
3242 contain everything that is not loop invariant and that can be
3243 vectorized. The following code attempts to find such a preexisting
3244 SSA_NAME OFF and put the loop invariants into a tree BASE
3245 that can be gimplified before the loop. */
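  /* Illustrative example (hypothetical loop): for a gather such as

       for (i = 0; i < n; i++)
	 sum += a[idx[i]];

     the intent is to end up with BASE == &a[0] (loop invariant), OFF ==
     the SSA_NAME holding the value loaded from idx[i] (sign-extended as
     needed) and SCALE == sizeof (*a), matching the
     loop_invariant + vector * {1, 2, 4, 8} form described above.  */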
3246 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3247 &punsignedp, &reversep, &pvolatilep, false);
3248 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3250 if (TREE_CODE (base) == MEM_REF)
3252 if (!integer_zerop (TREE_OPERAND (base, 1)))
3254 if (off == NULL_TREE)
3256 offset_int moff = mem_ref_offset (base);
3257 off = wide_int_to_tree (sizetype, moff);
3259 else
3260 off = size_binop (PLUS_EXPR, off,
3261 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3263 base = TREE_OPERAND (base, 0);
3265 else
3266 base = build_fold_addr_expr (base);
3268 if (off == NULL_TREE)
3269 off = size_zero_node;
3271 /* If base is not loop invariant, then if off is 0 we start with just
3272 the constant offset in the loop invariant BASE and continue with base
3273 as OFF, otherwise give up.
3274 We could handle that case by gimplifying the addition of base + off
3275 into some SSA_NAME and use that as off, but for now punt. */
3276 if (!expr_invariant_in_loop_p (loop, base))
3278 if (!integer_zerop (off))
3279 return NULL_TREE;
3280 off = base;
3281 base = size_int (pbitpos / BITS_PER_UNIT);
3283 /* Otherwise put base + constant offset into the loop invariant BASE
3284 and continue with OFF. */
3285 else
3287 base = fold_convert (sizetype, base);
3288 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3291 /* OFF at this point may be either a SSA_NAME or some tree expression
3292 from get_inner_reference. Try to peel off loop invariants from it
3293 into BASE as long as possible. */
3294 STRIP_NOPS (off);
3295 while (offtype == NULL_TREE)
3297 enum tree_code code;
3298 tree op0, op1, add = NULL_TREE;
3300 if (TREE_CODE (off) == SSA_NAME)
3302 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3304 if (expr_invariant_in_loop_p (loop, off))
3305 return NULL_TREE;
3307 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3308 break;
3310 op0 = gimple_assign_rhs1 (def_stmt);
3311 code = gimple_assign_rhs_code (def_stmt);
3312 op1 = gimple_assign_rhs2 (def_stmt);
3314 else
3316 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3317 return NULL_TREE;
3318 code = TREE_CODE (off);
3319 extract_ops_from_tree (off, &code, &op0, &op1);
3321 switch (code)
3323 case POINTER_PLUS_EXPR:
3324 case PLUS_EXPR:
3325 if (expr_invariant_in_loop_p (loop, op0))
3327 add = op0;
3328 off = op1;
3329 do_add:
3330 add = fold_convert (sizetype, add);
3331 if (scale != 1)
3332 add = size_binop (MULT_EXPR, add, size_int (scale));
3333 base = size_binop (PLUS_EXPR, base, add);
3334 continue;
3336 if (expr_invariant_in_loop_p (loop, op1))
3338 add = op1;
3339 off = op0;
3340 goto do_add;
3342 break;
3343 case MINUS_EXPR:
3344 if (expr_invariant_in_loop_p (loop, op1))
3346 add = fold_convert (sizetype, op1);
3347 add = size_binop (MINUS_EXPR, size_zero_node, add);
3348 off = op0;
3349 goto do_add;
3351 break;
3352 case MULT_EXPR:
3353 if (scale == 1 && tree_fits_shwi_p (op1))
3355 scale = tree_to_shwi (op1);
3356 off = op0;
3357 continue;
3359 break;
3360 case SSA_NAME:
3361 off = op0;
3362 continue;
3363 CASE_CONVERT:
3364 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3365 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3366 break;
3367 if (TYPE_PRECISION (TREE_TYPE (op0))
3368 == TYPE_PRECISION (TREE_TYPE (off)))
3370 off = op0;
3371 continue;
3373 if (TYPE_PRECISION (TREE_TYPE (op0))
3374 < TYPE_PRECISION (TREE_TYPE (off)))
3376 off = op0;
3377 offtype = TREE_TYPE (off);
3378 STRIP_NOPS (off);
3379 continue;
3381 break;
3382 default:
3383 break;
3385 break;
3388 /* If at the end OFF still isn't an SSA_NAME or isn't
3389 defined in the loop, punt. */
3390 if (TREE_CODE (off) != SSA_NAME
3391 || expr_invariant_in_loop_p (loop, off))
3392 return NULL_TREE;
3394 if (offtype == NULL_TREE)
3395 offtype = TREE_TYPE (off);
3397 if (DR_IS_READ (dr))
3398 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3399 offtype, scale);
3400 else
3401 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3402 offtype, scale);
3404 if (decl == NULL_TREE)
3405 return NULL_TREE;
3407 if (basep)
3408 *basep = base;
3409 if (offp)
3410 *offp = off;
3411 if (scalep)
3412 *scalep = scale;
3413 return decl;
3416 /* Function vect_analyze_data_refs.
3418 Find all the data references in the loop or basic block.
3420 The general structure of the analysis of data refs in the vectorizer is as
3421 follows:
3422 1- vect_analyze_data_refs(loop/bb): call
3423 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3424 in the loop/bb and their dependences.
3425 2- vect_analyze_data_ref_dependences (): apply dependence testing using ddrs.
3426 3- vect_analyze_data_refs_alignment (): check that ref_stmt.alignment is ok.
3427 4- vect_analyze_data_ref_accesses (): check that ref_stmt.step is ok.
3431 bool
3432 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3434 struct loop *loop = NULL;
3435 unsigned int i;
3436 struct data_reference *dr;
3437 tree scalar_type;
3439 if (dump_enabled_p ())
3440 dump_printf_loc (MSG_NOTE, vect_location,
3441 "=== vect_analyze_data_refs ===\n");
3443 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3444 loop = LOOP_VINFO_LOOP (loop_vinfo);
3446 /* Go through the data-refs, check that the analysis succeeded. Update
3447 pointer from stmt_vec_info struct to DR and vectype. */
3449 vec<data_reference_p> datarefs = vinfo->datarefs;
3450 FOR_EACH_VEC_ELT (datarefs, i, dr)
3452 gimple *stmt;
3453 stmt_vec_info stmt_info;
3454 tree base, offset, init;
3455 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3456 bool simd_lane_access = false;
3457 int vf;
3459 again:
3460 if (!dr || !DR_REF (dr))
3462 if (dump_enabled_p ())
3463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3464 "not vectorized: unhandled data-ref\n");
3465 return false;
3468 stmt = DR_STMT (dr);
3469 stmt_info = vinfo_for_stmt (stmt);
3471 /* Discard clobbers from the dataref vector. We will remove
3472 clobber stmts during vectorization. */
3473 if (gimple_clobber_p (stmt))
3475 free_data_ref (dr);
3476 if (i == datarefs.length () - 1)
3478 datarefs.pop ();
3479 break;
3481 datarefs.ordered_remove (i);
3482 dr = datarefs[i];
3483 goto again;
3486 /* Check that analysis of the data-ref succeeded. */
3487 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3488 || !DR_STEP (dr))
3490 bool maybe_gather
3491 = DR_IS_READ (dr)
3492 && !TREE_THIS_VOLATILE (DR_REF (dr))
3493 && targetm.vectorize.builtin_gather != NULL;
3494 bool maybe_scatter
3495 = DR_IS_WRITE (dr)
3496 && !TREE_THIS_VOLATILE (DR_REF (dr))
3497 && targetm.vectorize.builtin_scatter != NULL;
3498 bool maybe_simd_lane_access
3499 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3501 /* If target supports vector gather loads or scatter stores, or if
3502 this might be a SIMD lane access, see if they can be used. */
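	     /* Illustrative note (hypothetical source): a "SIMD lane
		access" typically comes from a variable privatized in a
		"#pragma omp simd" loop, which omp lowering can rewrite
		into an access to a per-lane simd array indexed by an
		IFN_GOMP_SIMD_LANE result times the element size; the code
		below recognizes that pattern and records it as a
		zero-offset data-ref whose step is the element size.  */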
3503 if (is_a <loop_vec_info> (vinfo)
3504 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3505 && !nested_in_vect_loop_p (loop, stmt))
3507 struct data_reference *newdr
3508 = create_data_ref (NULL, loop_containing_stmt (stmt),
3509 DR_REF (dr), stmt, maybe_scatter ? false : true);
3510 gcc_assert (newdr != NULL && DR_REF (newdr));
3511 if (DR_BASE_ADDRESS (newdr)
3512 && DR_OFFSET (newdr)
3513 && DR_INIT (newdr)
3514 && DR_STEP (newdr)
3515 && integer_zerop (DR_STEP (newdr)))
3517 if (maybe_simd_lane_access)
3519 tree off = DR_OFFSET (newdr);
3520 STRIP_NOPS (off);
3521 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3522 && TREE_CODE (off) == MULT_EXPR
3523 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3525 tree step = TREE_OPERAND (off, 1);
3526 off = TREE_OPERAND (off, 0);
3527 STRIP_NOPS (off);
3528 if (CONVERT_EXPR_P (off)
3529 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3530 0)))
3531 < TYPE_PRECISION (TREE_TYPE (off)))
3532 off = TREE_OPERAND (off, 0);
3533 if (TREE_CODE (off) == SSA_NAME)
3535 gimple *def = SSA_NAME_DEF_STMT (off);
3536 tree reft = TREE_TYPE (DR_REF (newdr));
3537 if (is_gimple_call (def)
3538 && gimple_call_internal_p (def)
3539 && (gimple_call_internal_fn (def)
3540 == IFN_GOMP_SIMD_LANE))
3542 tree arg = gimple_call_arg (def, 0);
3543 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3544 arg = SSA_NAME_VAR (arg);
3545 if (arg == loop->simduid
3546 /* For now. */
3547 && tree_int_cst_equal
3548 (TYPE_SIZE_UNIT (reft),
3549 step))
3551 DR_OFFSET (newdr) = ssize_int (0);
3552 DR_STEP (newdr) = step;
3553 DR_ALIGNED_TO (newdr)
3554 = size_int (BIGGEST_ALIGNMENT);
3555 dr = newdr;
3556 simd_lane_access = true;
3562 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3564 dr = newdr;
3565 if (maybe_gather)
3566 gatherscatter = GATHER;
3567 else
3568 gatherscatter = SCATTER;
3571 if (gatherscatter == SG_NONE && !simd_lane_access)
3572 free_data_ref (newdr);
3575 if (gatherscatter == SG_NONE && !simd_lane_access)
3577 if (dump_enabled_p ())
3579 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3580 "not vectorized: data ref analysis "
3581 "failed ");
3582 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3583 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3586 if (is_a <bb_vec_info> (vinfo))
3587 break;
3589 return false;
3593 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3595 if (dump_enabled_p ())
3596 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3597 "not vectorized: base addr of dr is a "
3598 "constant\n");
3600 if (is_a <bb_vec_info> (vinfo))
3601 break;
3603 if (gatherscatter != SG_NONE || simd_lane_access)
3604 free_data_ref (dr);
3605 return false;
3608 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3610 if (dump_enabled_p ())
3612 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3613 "not vectorized: volatile type ");
3614 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3615 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3618 if (is_a <bb_vec_info> (vinfo))
3619 break;
3621 return false;
3624 if (stmt_can_throw_internal (stmt))
3626 if (dump_enabled_p ())
3628 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3629 "not vectorized: statement can throw an "
3630 "exception ");
3631 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3632 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3635 if (is_a <bb_vec_info> (vinfo))
3636 break;
3638 if (gatherscatter != SG_NONE || simd_lane_access)
3639 free_data_ref (dr);
3640 return false;
3643 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3644 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3646 if (dump_enabled_p ())
3648 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3649 "not vectorized: statement is bitfield "
3650 "access ");
3651 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3652 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3655 if (is_a <bb_vec_info> (vinfo))
3656 break;
3658 if (gatherscatter != SG_NONE || simd_lane_access)
3659 free_data_ref (dr);
3660 return false;
3663 base = unshare_expr (DR_BASE_ADDRESS (dr));
3664 offset = unshare_expr (DR_OFFSET (dr));
3665 init = unshare_expr (DR_INIT (dr));
3667 if (is_gimple_call (stmt)
3668 && (!gimple_call_internal_p (stmt)
3669 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3670 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3672 if (dump_enabled_p ())
3674 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3675 "not vectorized: dr in a call ");
3676 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3677 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3680 if (is_a <bb_vec_info> (vinfo))
3681 break;
3683 if (gatherscatter != SG_NONE || simd_lane_access)
3684 free_data_ref (dr);
3685 return false;
3688 /* Update DR field in stmt_vec_info struct. */
3690 /* If the dataref is in an inner-loop of the loop that is considered
3691 for vectorization, we also want to analyze the access relative to
3692 the outer-loop (DR contains information only relative to the
3693 inner-most enclosing loop). We do that by building a reference to the
3694 first location accessed by the inner-loop, and analyze it relative to
3695 the outer-loop. */
3696 if (loop && nested_in_vect_loop_p (loop, stmt))
3698 tree outer_step, outer_base, outer_init;
3699 HOST_WIDE_INT pbitsize, pbitpos;
3700 tree poffset;
3701 machine_mode pmode;
3702 int punsignedp, preversep, pvolatilep;
3703 affine_iv base_iv, offset_iv;
3704 tree dinit;
3706 /* Build a reference to the first location accessed by the
3707 inner-loop: *(BASE+INIT). (The first location is actually
3708 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3709 tree inner_base = build_fold_indirect_ref
3710 (fold_build_pointer_plus (base, init));
3712 if (dump_enabled_p ())
3714 dump_printf_loc (MSG_NOTE, vect_location,
3715 "analyze in outer-loop: ");
3716 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3717 dump_printf (MSG_NOTE, "\n");
3720 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3721 &poffset, &pmode, &punsignedp,
3722 &preversep, &pvolatilep, false);
3723 gcc_assert (outer_base != NULL_TREE);
3725 if (pbitpos % BITS_PER_UNIT != 0)
3727 if (dump_enabled_p ())
3728 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3729 "failed: bit offset alignment.\n");
3730 return false;
3733 if (preversep)
3735 if (dump_enabled_p ())
3736 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3737 "failed: reverse storage order.\n");
3738 return false;
3741 outer_base = build_fold_addr_expr (outer_base);
3742 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3743 &base_iv, false))
3745 if (dump_enabled_p ())
3746 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3747 "failed: evolution of base is not affine.\n");
3748 return false;
3751 if (offset)
3753 if (poffset)
3754 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3755 poffset);
3756 else
3757 poffset = offset;
3760 if (!poffset)
3762 offset_iv.base = ssize_int (0);
3763 offset_iv.step = ssize_int (0);
3765 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3766 &offset_iv, false))
3768 if (dump_enabled_p ())
3769 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3770 "evolution of offset is not affine.\n");
3771 return false;
3774 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3775 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3776 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3777 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3778 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3780 outer_step = size_binop (PLUS_EXPR,
3781 fold_convert (ssizetype, base_iv.step),
3782 fold_convert (ssizetype, offset_iv.step));
3784 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3785 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3786 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3787 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3788 STMT_VINFO_DR_OFFSET (stmt_info) =
3789 fold_convert (ssizetype, offset_iv.base);
3790 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3791 size_int (highest_pow2_factor (offset_iv.base));
3793 if (dump_enabled_p ())
3795 dump_printf_loc (MSG_NOTE, vect_location,
3796 "\touter base_address: ");
3797 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3798 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3799 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3800 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3801 STMT_VINFO_DR_OFFSET (stmt_info));
3802 dump_printf (MSG_NOTE,
3803 "\n\touter constant offset from base address: ");
3804 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3805 STMT_VINFO_DR_INIT (stmt_info));
3806 dump_printf (MSG_NOTE, "\n\touter step: ");
3807 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3808 STMT_VINFO_DR_STEP (stmt_info));
3809 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3810 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3811 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3812 dump_printf (MSG_NOTE, "\n");
3816 if (STMT_VINFO_DATA_REF (stmt_info))
3818 if (dump_enabled_p ())
3820 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3821 "not vectorized: more than one data ref "
3822 "in stmt: ");
3823 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3824 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3827 if (is_a <bb_vec_info> (vinfo))
3828 break;
3830 if (gatherscatter != SG_NONE || simd_lane_access)
3831 free_data_ref (dr);
3832 return false;
3835 STMT_VINFO_DATA_REF (stmt_info) = dr;
3836 if (simd_lane_access)
3838 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3839 free_data_ref (datarefs[i]);
3840 datarefs[i] = dr;
3843 /* Set vectype for STMT. */
3844 scalar_type = TREE_TYPE (DR_REF (dr));
3845 STMT_VINFO_VECTYPE (stmt_info)
3846 = get_vectype_for_scalar_type (scalar_type);
3847 if (!STMT_VINFO_VECTYPE (stmt_info))
3849 if (dump_enabled_p ())
3851 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3852 "not vectorized: no vectype for stmt: ");
3853 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3854 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3855 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3856 scalar_type);
3857 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3860 if (is_a <bb_vec_info> (vinfo))
3862 /* No vector type is fine, the ref can still participate
3863 in dependence analysis, we just can't vectorize it. */
3864 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3865 continue;
3868 if (gatherscatter != SG_NONE || simd_lane_access)
3870 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3871 if (gatherscatter != SG_NONE)
3872 free_data_ref (dr);
3874 return false;
3876 else
3878 if (dump_enabled_p ())
3880 dump_printf_loc (MSG_NOTE, vect_location,
3881 "got vectype for stmt: ");
3882 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3883 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3884 STMT_VINFO_VECTYPE (stmt_info));
3885 dump_printf (MSG_NOTE, "\n");
3889 /* Adjust the minimal vectorization factor according to the
3890 vector type. */
3891 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3892 if (vf > *min_vf)
3893 *min_vf = vf;
3895 if (gatherscatter != SG_NONE)
3897 tree off;
3898 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3899 NULL, &off, NULL)
3900 || get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3902 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3903 free_data_ref (dr);
3904 if (dump_enabled_p ())
3906 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3907 (gatherscatter == GATHER) ?
3908 "not vectorized: not suitable for gather "
3909 "load " :
3910 "not vectorized: not suitable for scatter "
3911 "store ");
3912 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3913 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3915 return false;
3918 free_data_ref (datarefs[i]);
3919 datarefs[i] = dr;
3920 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3923 else if (is_a <loop_vec_info> (vinfo)
3924 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3926 if (nested_in_vect_loop_p (loop, stmt))
3928 if (dump_enabled_p ())
3930 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3931 "not vectorized: not suitable for strided "
3932 "load ");
3933 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3934 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3936 return false;
3938 STMT_VINFO_STRIDED_P (stmt_info) = true;
3942 /* If we stopped analysis at the first dataref we could not analyze
3943 when trying to vectorize a basic-block mark the rest of the datarefs
3944 as not vectorizable and truncate the vector of datarefs. That
3945 avoids spending useless time in analyzing their dependence. */
3946 if (i != datarefs.length ())
3948 gcc_assert (is_a <bb_vec_info> (vinfo));
3949 for (unsigned j = i; j < datarefs.length (); ++j)
3951 data_reference_p dr = datarefs[j];
3952 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3953 free_data_ref (dr);
3955 datarefs.truncate (i);
3958 return true;
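/* A note on the SIMD-lane-access rewrite performed above (illustrative
   only): when the offset of an analyzed reference has the form
   GOMP_SIMD_LANE (simduid) * <element size>, the freshly created data-ref
   is given DR_OFFSET 0 and DR_STEP <element size>, i.e. each lane of the
   "omp simd" private array is treated as a unit-stride access.  */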
3962 /* Function vect_get_new_vect_var.
3964 Returns a name for a new variable. The current naming scheme prepends
3965 a prefix chosen by VAR_KIND ("vect_", "vectp_", "stmp_" or "mask_")
3966 to NAME if NAME is provided, and otherwise uses the bare prefix as
3967 the name. */
3969 tree
3970 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3972 const char *prefix;
3973 tree new_vect_var;
3975 switch (var_kind)
3977 case vect_simple_var:
3978 prefix = "vect";
3979 break;
3980 case vect_scalar_var:
3981 prefix = "stmp";
3982 break;
3983 case vect_mask_var:
3984 prefix = "mask";
3985 break;
3986 case vect_pointer_var:
3987 prefix = "vectp";
3988 break;
3989 default:
3990 gcc_unreachable ();
3993 if (name)
3995 char* tmp = concat (prefix, "_", name, NULL);
3996 new_vect_var = create_tmp_reg (type, tmp);
3997 free (tmp);
3999 else
4000 new_vect_var = create_tmp_reg (type, prefix);
4002 return new_vect_var;
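/* Illustrative examples of the names vect_get_new_vect_var produces (any
   numeric suffixes the compiler adds later are omitted):

     vect_get_new_vect_var (vectype, vect_simple_var, "x")     -> vect_x
     vect_get_new_vect_var (ptr_type, vect_pointer_var, NULL)  -> vectp
     vect_get_new_vect_var (scal_type, vect_scalar_var, "sum") -> stmp_sum  */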
4005 /* Like vect_get_new_vect_var but return an SSA name. */
4007 tree
4008 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4010 const char *prefix;
4011 tree new_vect_var;
4013 switch (var_kind)
4015 case vect_simple_var:
4016 prefix = "vect";
4017 break;
4018 case vect_scalar_var:
4019 prefix = "stmp";
4020 break;
4021 case vect_pointer_var:
4022 prefix = "vectp";
4023 break;
4024 default:
4025 gcc_unreachable ();
4028 if (name)
4030 char* tmp = concat (prefix, "_", name, NULL);
4031 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4032 free (tmp);
4034 else
4035 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4037 return new_vect_var;
4040 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. */
4042 static void
4043 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
4044 stmt_vec_info stmt_info)
4046 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4047 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
4048 int misalign = DR_MISALIGNMENT (dr);
4049 if (misalign == -1)
4050 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4051 else
4052 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
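/* For example (values are only illustrative): with a 16-byte-aligned
   vectype, ALIGN is 16; a DR_MISALIGNMENT of 4 records NAME as
   "align 16, misalign 4", whereas a DR_MISALIGNMENT of -1 (unknown)
   clears any alignment from NAME's points-to info.  */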
4055 /* Function vect_create_addr_base_for_vector_ref.
4057 Create an expression that computes the address of the first memory location
4058 that will be accessed for a data reference.
4060 Input:
4061 STMT: The statement containing the data reference.
4062 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4063 OFFSET: Optional. If supplied, it is added to the initial address.
4064 LOOP: Specify relative to which loop-nest should the address be computed.
4065 For example, when the dataref is in an inner-loop nested in an
4066 outer-loop that is now being vectorized, LOOP can be either the
4067 outer-loop, or the inner-loop. The first memory location accessed
4068 by the following dataref ('in' points to short):
4070 for (i=0; i<N; i++)
4071 for (j=0; j<M; j++)
4072 s += in[i+j]
4074 is as follows:
4075 if LOOP=i_loop: &in (relative to i_loop)
4076 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4077 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4078 initial address. Unlike OFFSET, which is number of elements to
4079 be added, BYTE_OFFSET is measured in bytes.
4081 Output:
4082 1. Return an SSA_NAME whose value is the address of the memory location of
4083 the first vector of the data reference.
4084 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4085 these statement(s) which define the returned SSA_NAME.
4087 FORNOW: We are only handling array accesses with step 1. */
4089 tree
4090 vect_create_addr_base_for_vector_ref (gimple *stmt,
4091 gimple_seq *new_stmt_list,
4092 tree offset,
4093 struct loop *loop,
4094 tree byte_offset)
4096 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4097 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4098 tree data_ref_base;
4099 const char *base_name;
4100 tree addr_base;
4101 tree dest;
4102 gimple_seq seq = NULL;
4103 tree base_offset;
4104 tree init;
4105 tree vect_ptr_type;
4106 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4107 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4109 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
4111 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
4113 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
4115 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4116 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
4117 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
4119 else
4121 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
4122 base_offset = unshare_expr (DR_OFFSET (dr));
4123 init = unshare_expr (DR_INIT (dr));
4126 if (loop_vinfo)
4127 base_name = get_name (data_ref_base);
4128 else
4130 base_offset = ssize_int (0);
4131 init = ssize_int (0);
4132 base_name = get_name (DR_REF (dr));
4135 /* Create base_offset */
4136 base_offset = size_binop (PLUS_EXPR,
4137 fold_convert (sizetype, base_offset),
4138 fold_convert (sizetype, init));
4140 if (offset)
4142 offset = fold_build2 (MULT_EXPR, sizetype,
4143 fold_convert (sizetype, offset), step);
4144 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4145 base_offset, offset);
4147 if (byte_offset)
4149 byte_offset = fold_convert (sizetype, byte_offset);
4150 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4151 base_offset, byte_offset);
4154 /* base + base_offset */
4155 if (loop_vinfo)
4156 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4157 else
4159 addr_base = build1 (ADDR_EXPR,
4160 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4161 unshare_expr (DR_REF (dr)));
4164 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4165 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4166 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4167 gimple_seq_add_seq (new_stmt_list, seq);
4169 if (DR_PTR_INFO (dr)
4170 && TREE_CODE (addr_base) == SSA_NAME
4171 && !SSA_NAME_PTR_INFO (addr_base))
4173 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4174 if (offset || byte_offset)
4175 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4178 if (dump_enabled_p ())
4180 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4181 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4182 dump_printf (MSG_NOTE, "\n");
4185 return addr_base;
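/* Sketch of the result for a dataref into "int a[N]" (the SSA name below
   is hypothetical): with DR_BASE_ADDRESS = &a, DR_OFFSET = 0, DR_INIT = 0
   and OFFSET = 4, the emitted sequence is roughly

     vectp_a.1_2 = &a + 16;

   where 16 is OFFSET scaled by the 4-byte element size; vectp_a.1_2 is
   returned and its defining stmts are appended to NEW_STMT_LIST for the
   caller to insert.  */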
4189 /* Function vect_create_data_ref_ptr.
4191 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4192 location accessed in the loop by STMT, along with the def-use update
4193 chain to appropriately advance the pointer through the loop iterations.
4194 Also set aliasing information for the pointer. This pointer is used by
4195 the callers to this function to create a memory reference expression for
4196 vector load/store access.
4198 Input:
4199 1. STMT: a stmt that references memory. Expected to be of the form
4200 GIMPLE_ASSIGN <name, data-ref> or
4201 GIMPLE_ASSIGN <data-ref, name>.
4202 2. AGGR_TYPE: the type of the reference, which should be either a vector
4203 or an array.
4204 3. AT_LOOP: the loop where the vector memref is to be created.
4205 4. OFFSET (optional): an offset to be added to the initial address accessed
4206 by the data-ref in STMT.
4207 5. BSI: location where the new stmts are to be placed if there is no loop
4208 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4209 pointing to the initial address.
4210 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4211 to the initial address accessed by the data-ref in STMT. This is
4212 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4213 in bytes.
4215 Output:
4216 1. Declare a new ptr to vector_type, and have it point to the base of the
4217 data reference (initial address accessed by the data reference).
4218 For example, for vector of type V8HI, the following code is generated:
4220 v8hi *ap;
4221 ap = (v8hi *)initial_address;
4223 if OFFSET is not supplied:
4224 initial_address = &a[init];
4225 if OFFSET is supplied:
4226 initial_address = &a[init + OFFSET];
4227 if BYTE_OFFSET is supplied:
4228 initial_address = &a[init] + BYTE_OFFSET;
4230 Return the initial_address in INITIAL_ADDRESS.
4232 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4233 update the pointer in each iteration of the loop.
4235 Return the increment stmt that updates the pointer in PTR_INCR.
4237 3. Set INV_P to true if the access pattern of the data reference in the
4238 vectorized loop is invariant. Set it to false otherwise.
4240 4. Return the pointer. */
4242 tree
4243 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4244 tree offset, tree *initial_address,
4245 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4246 bool only_init, bool *inv_p, tree byte_offset)
4248 const char *base_name;
4249 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4250 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4251 struct loop *loop = NULL;
4252 bool nested_in_vect_loop = false;
4253 struct loop *containing_loop = NULL;
4254 tree aggr_ptr_type;
4255 tree aggr_ptr;
4256 tree new_temp;
4257 gimple_seq new_stmt_list = NULL;
4258 edge pe = NULL;
4259 basic_block new_bb;
4260 tree aggr_ptr_init;
4261 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4262 tree aptr;
4263 gimple_stmt_iterator incr_gsi;
4264 bool insert_after;
4265 tree indx_before_incr, indx_after_incr;
4266 gimple *incr;
4267 tree step;
4268 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4270 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4271 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4273 if (loop_vinfo)
4275 loop = LOOP_VINFO_LOOP (loop_vinfo);
4276 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4277 containing_loop = (gimple_bb (stmt))->loop_father;
4278 pe = loop_preheader_edge (loop);
4280 else
4282 gcc_assert (bb_vinfo);
4283 only_init = true;
4284 *ptr_incr = NULL;
4287 /* Check the step (evolution) of the load in LOOP, and record
4288 whether it's invariant. */
4289 if (nested_in_vect_loop)
4290 step = STMT_VINFO_DR_STEP (stmt_info);
4291 else
4292 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4294 if (integer_zerop (step))
4295 *inv_p = true;
4296 else
4297 *inv_p = false;
4299 /* Create an expression for the first address accessed by this load
4300 in LOOP. */
4301 base_name = get_name (DR_BASE_ADDRESS (dr));
4303 if (dump_enabled_p ())
4305 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4306 dump_printf_loc (MSG_NOTE, vect_location,
4307 "create %s-pointer variable to type: ",
4308 get_tree_code_name (TREE_CODE (aggr_type)));
4309 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4310 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4311 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4312 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4313 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4314 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4315 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4316 else
4317 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4318 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4319 dump_printf (MSG_NOTE, "\n");
4322 /* (1) Create the new aggregate-pointer variable.
4323 Vector and array types inherit the alias set of their component
4324 type by default so we need to use a ref-all pointer if the data
4325 reference does not conflict with the created aggregated data
4326 reference because it is not addressable. */
4327 bool need_ref_all = false;
4328 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4329 get_alias_set (DR_REF (dr))))
4330 need_ref_all = true;
4331 /* Likewise for any of the data references in the stmt group. */
4332 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4334 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4337 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4338 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4339 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4340 get_alias_set (DR_REF (sdr))))
4342 need_ref_all = true;
4343 break;
4345 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4347 while (orig_stmt);
4349 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4350 need_ref_all);
4351 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4354 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4355 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4356 def-use update cycles for the pointer: one relative to the outer-loop
4357 (LOOP), which is what steps (3) and (4) below do. The other is relative
4358 to the inner-loop (which is the inner-most loop containing the dataref),
4359 and this is done by step (5) below.
4361 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4362 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4363 redundant. Steps (3),(4) create the following:
4365 vp0 = &base_addr;
4366 LOOP: vp1 = phi(vp0,vp2)
4369 vp2 = vp1 + step
4370 goto LOOP
4372 If there is an inner-loop nested in loop, then step (5) will also be
4373 applied, and an additional update in the inner-loop will be created:
4375 vp0 = &base_addr;
4376 LOOP: vp1 = phi(vp0,vp2)
4378 inner: vp3 = phi(vp1,vp4)
4379 vp4 = vp3 + inner_step
4380 if () goto inner
4382 vp2 = vp1 + step
4383 if () goto LOOP */
4385 /* (2) Calculate the initial address of the aggregate-pointer, and set
4386 the aggregate-pointer to point to it before the loop. */
4388 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4390 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4391 offset, loop, byte_offset);
4392 if (new_stmt_list)
4394 if (pe)
4396 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4397 gcc_assert (!new_bb);
4399 else
4400 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4403 *initial_address = new_temp;
4404 aggr_ptr_init = new_temp;
4406 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4407 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4408 inner-loop nested in LOOP (during outer-loop vectorization). */
4410 /* No update in loop is required. */
4411 if (only_init && (!loop_vinfo || at_loop == loop))
4412 aptr = aggr_ptr_init;
4413 else
4415 /* The step of the aggregate pointer is the type size. */
4416 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4417 /* One exception to the above is when the scalar step of the load in
4418 LOOP is zero. In this case the step here is also zero. */
4419 if (*inv_p)
4420 iv_step = size_zero_node;
4421 else if (tree_int_cst_sgn (step) == -1)
4422 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4424 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4426 create_iv (aggr_ptr_init,
4427 fold_convert (aggr_ptr_type, iv_step),
4428 aggr_ptr, loop, &incr_gsi, insert_after,
4429 &indx_before_incr, &indx_after_incr);
4430 incr = gsi_stmt (incr_gsi);
4431 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4433 /* Copy the points-to information if it exists. */
4434 if (DR_PTR_INFO (dr))
4436 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4437 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4439 if (ptr_incr)
4440 *ptr_incr = incr;
4442 aptr = indx_before_incr;
4445 if (!nested_in_vect_loop || only_init)
4446 return aptr;
4449 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4450 nested in LOOP, if one exists. */
4452 gcc_assert (nested_in_vect_loop);
4453 if (!only_init)
4455 standard_iv_increment_position (containing_loop, &incr_gsi,
4456 &insert_after);
4457 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4458 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4459 &indx_after_incr);
4460 incr = gsi_stmt (incr_gsi);
4461 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4463 /* Copy the points-to information if it exists. */
4464 if (DR_PTR_INFO (dr))
4466 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4467 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4469 if (ptr_incr)
4470 *ptr_incr = incr;
4472 return indx_before_incr;
4474 else
4475 gcc_unreachable ();
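/* A sketch of typical use (variable names are hypothetical; the real
   callers are the load/store vectorization routines):

     tree dataref_ptr
       = vect_create_data_ref_ptr (stmt, vectype, loop, NULL_TREE,
                                   &dummy, gsi, &ptr_incr, false, &inv_p,
                                   NULL_TREE);
     ... build a MEM_REF based on dataref_ptr ...
     dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
                                    NULL_TREE);  */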
4479 /* Function bump_vector_ptr
4481 Increment a pointer (to a vector type) by vector-size. If requested,
4482 i.e. if PTR-INCR is given, then also connect the new increment stmt
4483 to the existing def-use update-chain of the pointer, by modifying
4484 the PTR_INCR as illustrated below:
4486 The pointer def-use update-chain before this function:
4487 DATAREF_PTR = phi (p_0, p_2)
4488 ....
4489 PTR_INCR: p_2 = DATAREF_PTR + step
4491 The pointer def-use update-chain after this function:
4492 DATAREF_PTR = phi (p_0, p_2)
4493 ....
4494 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4495 ....
4496 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4498 Input:
4499 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4500 in the loop.
4501 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4502 the loop. The increment amount across iterations is expected
4503 to be vector_size.
4504 BSI - location where the new update stmt is to be placed.
4505 STMT - the original scalar memory-access stmt that is being vectorized.
4506 BUMP - optional. The offset by which to bump the pointer. If not given,
4507 the offset is assumed to be vector_size.
4509 Output: Return NEW_DATAREF_PTR as illustrated above.
4513 tree
4514 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4515 gimple *stmt, tree bump)
4517 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4518 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4519 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4520 tree update = TYPE_SIZE_UNIT (vectype);
4521 gassign *incr_stmt;
4522 ssa_op_iter iter;
4523 use_operand_p use_p;
4524 tree new_dataref_ptr;
4526 if (bump)
4527 update = bump;
4529 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4530 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4531 else
4532 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4533 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4534 dataref_ptr, update);
4535 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4537 /* Copy the points-to information if it exists. */
4538 if (DR_PTR_INFO (dr))
4540 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4541 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4544 if (!ptr_incr)
4545 return new_dataref_ptr;
4547 /* Update the vector-pointer's cross-iteration increment. */
4548 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4550 tree use = USE_FROM_PTR (use_p);
4552 if (use == dataref_ptr)
4553 SET_USE (use_p, new_dataref_ptr);
4554 else
4555 gcc_assert (tree_int_cst_compare (use, update) == 0);
4558 return new_dataref_ptr;
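/* For example (illustrative): for a V8HI access the default bump is
   TYPE_SIZE_UNIT (vectype), i.e. 16 bytes, so the new stmt is

     NEW_DATAREF_PTR = DATAREF_PTR + 16;

   and, if PTR_INCR was given, its use of DATAREF_PTR is redirected to
   NEW_DATAREF_PTR.  */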
4562 /* Function vect_create_destination_var.
4564 Create a new temporary of type VECTYPE. */
4566 tree
4567 vect_create_destination_var (tree scalar_dest, tree vectype)
4569 tree vec_dest;
4570 const char *name;
4571 char *new_name;
4572 tree type;
4573 enum vect_var_kind kind;
4575 kind = vectype
4576 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4577 ? vect_mask_var
4578 : vect_simple_var
4579 : vect_scalar_var;
4580 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4582 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4584 name = get_name (scalar_dest);
4585 if (name)
4586 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4587 else
4588 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4589 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4590 free (new_name);
4592 return vec_dest;
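/* Illustrative examples (the SSA version is hypothetical): for scalar_dest
   x_7 with a V4SI vectype the new temporary is named "vect_x_7"; with a
   boolean vector type it is "mask_x_7"; with a NULL vectype it is the
   scalar temporary "stmp_x_7".  */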
4595 /* Function vect_grouped_store_supported.
4597 Returns TRUE if interleave high and interleave low permutations
4598 are supported, and FALSE otherwise. */
4600 bool
4601 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4603 machine_mode mode = TYPE_MODE (vectype);
4605 /* vect_permute_store_chain requires the group size to be equal to 3 or
4606 be a power of two. */
4607 if (count != 3 && exact_log2 (count) == -1)
4609 if (dump_enabled_p ())
4610 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4611 "the size of the group of accesses"
4612 " is not a power of 2 or not equal to 3\n");
4613 return false;
4616 /* Check that the permutation is supported. */
4617 if (VECTOR_MODE_P (mode))
4619 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4620 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4622 if (count == 3)
4624 unsigned int j0 = 0, j1 = 0, j2 = 0;
4625 unsigned int i, j;
4627 for (j = 0; j < 3; j++)
4629 int nelt0 = ((3 - j) * nelt) % 3;
4630 int nelt1 = ((3 - j) * nelt + 1) % 3;
4631 int nelt2 = ((3 - j) * nelt + 2) % 3;
4632 for (i = 0; i < nelt; i++)
4634 if (3 * i + nelt0 < nelt)
4635 sel[3 * i + nelt0] = j0++;
4636 if (3 * i + nelt1 < nelt)
4637 sel[3 * i + nelt1] = nelt + j1++;
4638 if (3 * i + nelt2 < nelt)
4639 sel[3 * i + nelt2] = 0;
4641 if (!can_vec_perm_p (mode, false, sel))
4643 if (dump_enabled_p ())
4644 dump_printf (MSG_MISSED_OPTIMIZATION,
4645 "permutation op not supported by target.\n");
4646 return false;
4649 for (i = 0; i < nelt; i++)
4651 if (3 * i + nelt0 < nelt)
4652 sel[3 * i + nelt0] = 3 * i + nelt0;
4653 if (3 * i + nelt1 < nelt)
4654 sel[3 * i + nelt1] = 3 * i + nelt1;
4655 if (3 * i + nelt2 < nelt)
4656 sel[3 * i + nelt2] = nelt + j2++;
4658 if (!can_vec_perm_p (mode, false, sel))
4660 if (dump_enabled_p ())
4661 dump_printf (MSG_MISSED_OPTIMIZATION,
4662 "permutation op not supported by target.\n");
4663 return false;
4666 return true;
4668 else
4670 /* If length is not equal to 3 then only power of 2 is supported. */
4671 gcc_assert (exact_log2 (count) != -1);
4673 for (i = 0; i < nelt / 2; i++)
4675 sel[i * 2] = i;
4676 sel[i * 2 + 1] = i + nelt;
4678 if (can_vec_perm_p (mode, false, sel))
4680 for (i = 0; i < nelt; i++)
4681 sel[i] += nelt / 2;
4682 if (can_vec_perm_p (mode, false, sel))
4683 return true;
4688 if (dump_enabled_p ())
4689 dump_printf (MSG_MISSED_OPTIMIZATION,
4690 "permutation op not supported by target.\n");
4691 return false;
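/* For example, with four-element vectors (NELT == 4) the power-of-two path
   above checks the two interleave masks {0, 4, 1, 5} and {2, 6, 3, 7};
   both must be accepted by can_vec_perm_p for the grouped store to be
   vectorizable.  */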
4695 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4696 type VECTYPE. */
4698 bool
4699 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4701 return vect_lanes_optab_supported_p ("vec_store_lanes",
4702 vec_store_lanes_optab,
4703 vectype, count);
4707 /* Function vect_permute_store_chain.
4709 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4710 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4711 the data correctly for the stores. Return the final references for stores
4712 in RESULT_CHAIN.
4714 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4715 The input is 4 vectors each containing 8 elements. We assign a number to
4716 each element, the input sequence is:
4718 1st vec: 0 1 2 3 4 5 6 7
4719 2nd vec: 8 9 10 11 12 13 14 15
4720 3rd vec: 16 17 18 19 20 21 22 23
4721 4th vec: 24 25 26 27 28 29 30 31
4723 The output sequence should be:
4725 1st vec: 0 8 16 24 1 9 17 25
4726 2nd vec: 2 10 18 26 3 11 19 27
4727 3rd vec: 4 12 20 28 5 13 21 29
4728 4th vec: 6 14 22 30 7 15 23 31
4730 i.e., we interleave the contents of the four vectors in their order.
4732 We use interleave_high/low instructions to create such output. The input of
4733 each interleave_high/low operation is two vectors:
4734 1st vec 2nd vec
4735 0 1 2 3 4 5 6 7
4736 the even elements of the result vector are obtained left-to-right from the
4737 high/low elements of the first vector. The odd elements of the result are
4738 obtained left-to-right from the high/low elements of the second vector.
4739 The output of interleave_high will be: 0 4 1 5
4740 and of interleave_low: 2 6 3 7
4743 The permutation is done in log LENGTH stages. In each stage interleave_high
4744 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4745 where the first argument is taken from the first half of DR_CHAIN and the
4746 second argument from its second half.
4747 In our example,
4749 I1: interleave_high (1st vec, 3rd vec)
4750 I2: interleave_low (1st vec, 3rd vec)
4751 I3: interleave_high (2nd vec, 4th vec)
4752 I4: interleave_low (2nd vec, 4th vec)
4754 The output for the first stage is:
4756 I1: 0 16 1 17 2 18 3 19
4757 I2: 4 20 5 21 6 22 7 23
4758 I3: 8 24 9 25 10 26 11 27
4759 I4: 12 28 13 29 14 30 15 31
4761 The output of the second stage, i.e. the final result is:
4763 I1: 0 8 16 24 1 9 17 25
4764 I2: 2 10 18 26 3 11 19 27
4765 I3: 4 12 20 28 5 13 21 29
4766 I4: 6 14 22 30 7 15 23 31. */
4768 void
4769 vect_permute_store_chain (vec<tree> dr_chain,
4770 unsigned int length,
4771 gimple *stmt,
4772 gimple_stmt_iterator *gsi,
4773 vec<tree> *result_chain)
4775 tree vect1, vect2, high, low;
4776 gimple *perm_stmt;
4777 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4778 tree perm_mask_low, perm_mask_high;
4779 tree data_ref;
4780 tree perm3_mask_low, perm3_mask_high;
4781 unsigned int i, n, log_length = exact_log2 (length);
4782 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4783 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4785 result_chain->quick_grow (length);
4786 memcpy (result_chain->address (), dr_chain.address (),
4787 length * sizeof (tree));
4789 if (length == 3)
4791 unsigned int j0 = 0, j1 = 0, j2 = 0;
4793 for (j = 0; j < 3; j++)
4795 int nelt0 = ((3 - j) * nelt) % 3;
4796 int nelt1 = ((3 - j) * nelt + 1) % 3;
4797 int nelt2 = ((3 - j) * nelt + 2) % 3;
4799 for (i = 0; i < nelt; i++)
4801 if (3 * i + nelt0 < nelt)
4802 sel[3 * i + nelt0] = j0++;
4803 if (3 * i + nelt1 < nelt)
4804 sel[3 * i + nelt1] = nelt + j1++;
4805 if (3 * i + nelt2 < nelt)
4806 sel[3 * i + nelt2] = 0;
4808 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4810 for (i = 0; i < nelt; i++)
4812 if (3 * i + nelt0 < nelt)
4813 sel[3 * i + nelt0] = 3 * i + nelt0;
4814 if (3 * i + nelt1 < nelt)
4815 sel[3 * i + nelt1] = 3 * i + nelt1;
4816 if (3 * i + nelt2 < nelt)
4817 sel[3 * i + nelt2] = nelt + j2++;
4819 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4821 vect1 = dr_chain[0];
4822 vect2 = dr_chain[1];
4824 /* Create interleaving stmt:
4825 low = VEC_PERM_EXPR <vect1, vect2,
4826 {j, nelt, *, j + 1, nelt + j + 1, *,
4827 j + 2, nelt + j + 2, *, ...}> */
4828 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4829 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4830 vect2, perm3_mask_low);
4831 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4833 vect1 = data_ref;
4834 vect2 = dr_chain[2];
4835 /* Create interleaving stmt:
4836 low = VEC_PERM_EXPR <vect1, vect2,
4837 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4838 6, 7, nelt + j + 2, ...}> */
4839 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4840 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4841 vect2, perm3_mask_high);
4842 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4843 (*result_chain)[j] = data_ref;
4846 else
4848 /* If length is not equal to 3 then only power of 2 is supported. */
4849 gcc_assert (exact_log2 (length) != -1);
4851 for (i = 0, n = nelt / 2; i < n; i++)
4853 sel[i * 2] = i;
4854 sel[i * 2 + 1] = i + nelt;
4856 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4858 for (i = 0; i < nelt; i++)
4859 sel[i] += nelt / 2;
4860 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4862 for (i = 0, n = log_length; i < n; i++)
4864 for (j = 0; j < length/2; j++)
4866 vect1 = dr_chain[j];
4867 vect2 = dr_chain[j+length/2];
4869 /* Create interleaving stmt:
4870 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4871 ...}> */
4872 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4873 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4874 vect2, perm_mask_high);
4875 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4876 (*result_chain)[2*j] = high;
4878 /* Create interleaving stmt:
4879 low = VEC_PERM_EXPR <vect1, vect2,
4880 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4881 ...}> */
4882 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4883 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4884 vect2, perm_mask_low);
4885 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4886 (*result_chain)[2*j+1] = low;
4888 memcpy (dr_chain.address (), result_chain->address (),
4889 length * sizeof (tree));
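/* Sketch of the stmts generated by the power-of-two path for four-element
   vectors (the SSA names are hypothetical):

     vect_inter_high_1 = VEC_PERM_EXPR <vect1, vect2, { 0, 4, 1, 5 }>;
     vect_inter_low_2  = VEC_PERM_EXPR <vect1, vect2, { 2, 6, 3, 7 }>;

   one such pair is emitted per vector pair and per stage, for
   log2 (LENGTH) stages.  */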
4894 /* Function vect_setup_realignment
4896 This function is called when vectorizing an unaligned load using
4897 the dr_explicit_realign[_optimized] scheme.
4898 This function generates the following code at the loop prolog:
4900 p = initial_addr;
4901 x msq_init = *(floor(p)); # prolog load
4902 realignment_token = call target_builtin;
4903 loop:
4904 x msq = phi (msq_init, ---)
4906 The stmts marked with x are generated only for the case of
4907 dr_explicit_realign_optimized.
4909 The code above sets up a new (vector) pointer, pointing to the first
4910 location accessed by STMT, and a "floor-aligned" load using that pointer.
4911 It also generates code to compute the "realignment-token" (if the relevant
4912 target hook was defined), and creates a phi-node at the loop-header bb
4913 whose arguments are the result of the prolog-load (created by this
4914 function) and the result of a load that takes place in the loop (to be
4915 created by the caller to this function).
4917 For the case of dr_explicit_realign_optimized:
4918 The caller to this function uses the phi-result (msq) to create the
4919 realignment code inside the loop, and sets up the missing phi argument,
4920 as follows:
4921 loop:
4922 msq = phi (msq_init, lsq)
4923 lsq = *(floor(p')); # load in loop
4924 result = realign_load (msq, lsq, realignment_token);
4926 For the case of dr_explicit_realign:
4927 loop:
4928 msq = *(floor(p)); # load in loop
4929 p' = p + (VS-1);
4930 lsq = *(floor(p')); # load in loop
4931 result = realign_load (msq, lsq, realignment_token);
4933 Input:
4934 STMT - (scalar) load stmt to be vectorized. This load accesses
4935 a memory location that may be unaligned.
4936 BSI - place where new code is to be inserted.
4937 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4938 is used.
4940 Output:
4941 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4942 target hook, if defined.
4943 Return value - the result of the loop-header phi node. */
4945 tree
4946 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4947 tree *realignment_token,
4948 enum dr_alignment_support alignment_support_scheme,
4949 tree init_addr,
4950 struct loop **at_loop)
4952 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4953 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4954 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4955 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4956 struct loop *loop = NULL;
4957 edge pe = NULL;
4958 tree scalar_dest = gimple_assign_lhs (stmt);
4959 tree vec_dest;
4960 gimple *inc;
4961 tree ptr;
4962 tree data_ref;
4963 basic_block new_bb;
4964 tree msq_init = NULL_TREE;
4965 tree new_temp;
4966 gphi *phi_stmt;
4967 tree msq = NULL_TREE;
4968 gimple_seq stmts = NULL;
4969 bool inv_p;
4970 bool compute_in_loop = false;
4971 bool nested_in_vect_loop = false;
4972 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4973 struct loop *loop_for_initial_load = NULL;
4975 if (loop_vinfo)
4977 loop = LOOP_VINFO_LOOP (loop_vinfo);
4978 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4981 gcc_assert (alignment_support_scheme == dr_explicit_realign
4982 || alignment_support_scheme == dr_explicit_realign_optimized);
4984 /* We need to generate three things:
4985 1. the misalignment computation
4986 2. the extra vector load (for the optimized realignment scheme).
4987 3. the phi node for the two vectors from which the realignment is
4988 done (for the optimized realignment scheme). */
4990 /* 1. Determine where to generate the misalignment computation.
4992 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4993 calculation will be generated by this function, outside the loop (in the
4994 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4995 caller, inside the loop.
4997 Background: If the misalignment remains fixed throughout the iterations of
4998 the loop, then both realignment schemes are applicable, and also the
4999 misalignment computation can be done outside LOOP. This is because we are
5000 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5001 are a multiple of VS (the Vector Size), and therefore the misalignment in
5002 different vectorized LOOP iterations is always the same.
5003 The problem arises only if the memory access is in an inner-loop nested
5004 inside LOOP, which is now being vectorized using outer-loop vectorization.
5005 This is the only case when the misalignment of the memory access may not
5006 remain fixed throughout the iterations of the inner-loop (as explained in
5007 detail in vect_supportable_dr_alignment). In this case, not only is the
5008 optimized realignment scheme not applicable, but also the misalignment
5009 computation (and generation of the realignment token that is passed to
5010 REALIGN_LOAD) have to be done inside the loop.
5012 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5013 or not, which in turn determines if the misalignment is computed inside
5014 the inner-loop, or outside LOOP. */
5016 if (init_addr != NULL_TREE || !loop_vinfo)
5018 compute_in_loop = true;
5019 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5023 /* 2. Determine where to generate the extra vector load.
5025 For the optimized realignment scheme, instead of generating two vector
5026 loads in each iteration, we generate a single extra vector load in the
5027 preheader of the loop, and in each iteration reuse the result of the
5028 vector load from the previous iteration. In case the memory access is in
5029 an inner-loop nested inside LOOP, which is now being vectorized using
5030 outer-loop vectorization, we need to determine whether this initial vector
5031 load should be generated at the preheader of the inner-loop, or can be
5032 generated at the preheader of LOOP. If the memory access has no evolution
5033 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5034 to be generated inside LOOP (in the preheader of the inner-loop). */
5036 if (nested_in_vect_loop)
5038 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5039 bool invariant_in_outerloop =
5040 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5041 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5043 else
5044 loop_for_initial_load = loop;
5045 if (at_loop)
5046 *at_loop = loop_for_initial_load;
5048 if (loop_for_initial_load)
5049 pe = loop_preheader_edge (loop_for_initial_load);
5051 /* 3. For the case of the optimized realignment, create the first vector
5052 load at the loop preheader. */
5054 if (alignment_support_scheme == dr_explicit_realign_optimized)
5056 /* Create msq_init = *(floor(p1)) in the loop preheader */
5057 gassign *new_stmt;
5059 gcc_assert (!compute_in_loop);
5060 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5061 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5062 NULL_TREE, &init_addr, NULL, &inc,
5063 true, &inv_p);
5064 if (TREE_CODE (ptr) == SSA_NAME)
5065 new_temp = copy_ssa_name (ptr);
5066 else
5067 new_temp = make_ssa_name (TREE_TYPE (ptr));
5068 new_stmt = gimple_build_assign
5069 (new_temp, BIT_AND_EXPR, ptr,
5070 build_int_cst (TREE_TYPE (ptr),
5071 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
5072 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5073 gcc_assert (!new_bb);
5074 data_ref
5075 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5076 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5077 new_stmt = gimple_build_assign (vec_dest, data_ref);
5078 new_temp = make_ssa_name (vec_dest, new_stmt);
5079 gimple_assign_set_lhs (new_stmt, new_temp);
5080 if (pe)
5082 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5083 gcc_assert (!new_bb);
5085 else
5086 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5088 msq_init = gimple_assign_lhs (new_stmt);
5091 /* 4. Create realignment token using a target builtin, if available.
5092 It is done either inside the containing loop, or before LOOP (as
5093 determined above). */
5095 if (targetm.vectorize.builtin_mask_for_load)
5097 gcall *new_stmt;
5098 tree builtin_decl;
5100 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5101 if (!init_addr)
5103 /* Generate the INIT_ADDR computation outside LOOP. */
5104 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5105 NULL_TREE, loop);
5106 if (loop)
5108 pe = loop_preheader_edge (loop);
5109 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5110 gcc_assert (!new_bb);
5112 else
5113 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5116 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5117 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5118 vec_dest =
5119 vect_create_destination_var (scalar_dest,
5120 gimple_call_return_type (new_stmt));
5121 new_temp = make_ssa_name (vec_dest, new_stmt);
5122 gimple_call_set_lhs (new_stmt, new_temp);
5124 if (compute_in_loop)
5125 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5126 else
5128 /* Generate the misalignment computation outside LOOP. */
5129 pe = loop_preheader_edge (loop);
5130 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5131 gcc_assert (!new_bb);
5134 *realignment_token = gimple_call_lhs (new_stmt);
5136 /* The result of the CALL_EXPR to this builtin is determined from
5137 the value of the parameter and no global variables are touched
5138 which makes the builtin a "const" function. Requiring the
5139 builtin to have the "const" attribute makes it unnecessary
5140 to call mark_call_clobbered. */
5141 gcc_assert (TREE_READONLY (builtin_decl));
5144 if (alignment_support_scheme == dr_explicit_realign)
5145 return msq;
5147 gcc_assert (!compute_in_loop);
5148 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5151 /* 5. Create msq = phi <msq_init, lsq> in loop */
5153 pe = loop_preheader_edge (containing_loop);
5154 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5155 msq = make_ssa_name (vec_dest);
5156 phi_stmt = create_phi_node (msq, containing_loop->header);
5157 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5159 return msq;
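/* Sketch of the prolog emitted for dr_explicit_realign_optimized (the
   names and the 16-byte vector size are hypothetical):

     vectp_2 = vectp_1 & -16;          <- floor-align the initial pointer
     msq_init_3 = *vectp_2;            <- prolog load
     realignment_token = <target mask_for_load builtin> (init_addr);

   and in the loop header:  msq = PHI <msq_init_3 (preheader), ...>, whose
   missing argument (lsq) is added later by the caller.  */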
5163 /* Function vect_grouped_load_supported.
5165 Returns TRUE if even and odd permutations are supported,
5166 and FALSE otherwise. */
5168 bool
5169 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
5171 machine_mode mode = TYPE_MODE (vectype);
5173 /* vect_permute_load_chain requires the group size to be equal to 3 or
5174 be a power of two. */
5175 if (count != 3 && exact_log2 (count) == -1)
5177 if (dump_enabled_p ())
5178 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5179 "the size of the group of accesses"
5180 " is not a power of 2 or not equal to 3\n");
5181 return false;
5184 /* Check that the permutation is supported. */
5185 if (VECTOR_MODE_P (mode))
5187 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5188 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5190 if (count == 3)
5192 unsigned int k;
5193 for (k = 0; k < 3; k++)
5195 for (i = 0; i < nelt; i++)
5196 if (3 * i + k < 2 * nelt)
5197 sel[i] = 3 * i + k;
5198 else
5199 sel[i] = 0;
5200 if (!can_vec_perm_p (mode, false, sel))
5202 if (dump_enabled_p ())
5203 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5204 "shuffle of 3 loads is not supported by"
5205 " target\n");
5206 return false;
5208 for (i = 0, j = 0; i < nelt; i++)
5209 if (3 * i + k < 2 * nelt)
5210 sel[i] = i;
5211 else
5212 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5213 if (!can_vec_perm_p (mode, false, sel))
5215 if (dump_enabled_p ())
5216 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5217 "shuffle of 3 loads is not supported by"
5218 " target\n");
5219 return false;
5222 return true;
5224 else
5226 /* If length is not equal to 3 then only power of 2 is supported. */
5227 gcc_assert (exact_log2 (count) != -1);
5228 for (i = 0; i < nelt; i++)
5229 sel[i] = i * 2;
5230 if (can_vec_perm_p (mode, false, sel))
5232 for (i = 0; i < nelt; i++)
5233 sel[i] = i * 2 + 1;
5234 if (can_vec_perm_p (mode, false, sel))
5235 return true;
5240 if (dump_enabled_p ())
5241 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5242 "extract even/odd not supported by target\n");
5243 return false;
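/* For example, with four-element vectors (NELT == 4) the power-of-two path
   above checks the extract-even mask {0, 2, 4, 6} and the extract-odd mask
   {1, 3, 5, 7}; both must be supported for the grouped load to be
   vectorizable.  */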
5246 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5247 type VECTYPE. */
5249 bool
5250 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5252 return vect_lanes_optab_supported_p ("vec_load_lanes",
5253 vec_load_lanes_optab,
5254 vectype, count);
5257 /* Function vect_permute_load_chain.
5259 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5260 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5261 the input data correctly. Return the final references for loads in
5262 RESULT_CHAIN.
5264 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5265 The input is 4 vectors each containing 8 elements. We assign a number to each
5266 element, the input sequence is:
5268 1st vec: 0 1 2 3 4 5 6 7
5269 2nd vec: 8 9 10 11 12 13 14 15
5270 3rd vec: 16 17 18 19 20 21 22 23
5271 4th vec: 24 25 26 27 28 29 30 31
5273 The output sequence should be:
5275 1st vec: 0 4 8 12 16 20 24 28
5276 2nd vec: 1 5 9 13 17 21 25 29
5277 3rd vec: 2 6 10 14 18 22 26 30
5278 4th vec: 3 7 11 15 19 23 27 31
5280 i.e., the first output vector should contain the first elements of each
5281 interleaving group, etc.
5283 We use extract_even/odd instructions to create such output. The input of
5284 each extract_even/odd operation is two vectors
5285 1st vec 2nd vec
5286 0 1 2 3 4 5 6 7
5288 and the output is the vector of extracted even/odd elements. The output of
5289 extract_even will be: 0 2 4 6
5290 and of extract_odd: 1 3 5 7
5293 The permutation is done in log LENGTH stages. In each stage extract_even
5294 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5295 their order. In our example,
5297 E1: extract_even (1st vec, 2nd vec)
5298 E2: extract_odd (1st vec, 2nd vec)
5299 E3: extract_even (3rd vec, 4th vec)
5300 E4: extract_odd (3rd vec, 4th vec)
5302 The output for the first stage will be:
5304 E1: 0 2 4 6 8 10 12 14
5305 E2: 1 3 5 7 9 11 13 15
5306 E3: 16 18 20 22 24 26 28 30
5307 E4: 17 19 21 23 25 27 29 31
5309 In order to proceed and create the correct sequence for the next stage (or
5310 for the correct output, if the second stage is the last one, as in our
5311 example), we first put the output of extract_even operation and then the
5312 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5313 The input for the second stage is:
5315 1st vec (E1): 0 2 4 6 8 10 12 14
5316 2nd vec (E3): 16 18 20 22 24 26 28 30
5317 3rd vec (E2): 1 3 5 7 9 11 13 15
5318 4th vec (E4): 17 19 21 23 25 27 29 31
5320 The output of the second stage:
5322 E1: 0 4 8 12 16 20 24 28
5323 E2: 2 6 10 14 18 22 26 30
5324 E3: 1 5 9 13 17 21 25 29
5325 E4: 3 7 11 15 19 23 27 31
5327 And RESULT_CHAIN after reordering:
5329 1st vec (E1): 0 4 8 12 16 20 24 28
5330 2nd vec (E3): 1 5 9 13 17 21 25 29
5331 3rd vec (E2): 2 6 10 14 18 22 26 30
5332 4th vec (E4): 3 7 11 15 19 23 27 31. */
5334 static void
5335 vect_permute_load_chain (vec<tree> dr_chain,
5336 unsigned int length,
5337 gimple *stmt,
5338 gimple_stmt_iterator *gsi,
5339 vec<tree> *result_chain)
5341 tree data_ref, first_vect, second_vect;
5342 tree perm_mask_even, perm_mask_odd;
5343 tree perm3_mask_low, perm3_mask_high;
5344 gimple *perm_stmt;
5345 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5346 unsigned int i, j, log_length = exact_log2 (length);
5347 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5348 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5350 result_chain->quick_grow (length);
5351 memcpy (result_chain->address (), dr_chain.address (),
5352 length * sizeof (tree));
5354 if (length == 3)
5356 unsigned int k;
5358 for (k = 0; k < 3; k++)
5360 for (i = 0; i < nelt; i++)
5361 if (3 * i + k < 2 * nelt)
5362 sel[i] = 3 * i + k;
5363 else
5364 sel[i] = 0;
5365 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5367 for (i = 0, j = 0; i < nelt; i++)
5368 if (3 * i + k < 2 * nelt)
5369 sel[i] = i;
5370 else
5371 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5373 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5375 first_vect = dr_chain[0];
5376 second_vect = dr_chain[1];
5378 /* Create interleaving stmt (low part of):
5379 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5380 ...}> */
5381 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5382 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5383 second_vect, perm3_mask_low);
5384 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5386 /* Create interleaving stmt (high part of):
5387 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5388 ...}> */
5389 first_vect = data_ref;
5390 second_vect = dr_chain[2];
5391 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5392 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5393 second_vect, perm3_mask_high);
5394 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5395 (*result_chain)[k] = data_ref;
5398 else
5400 /* If length is not equal to 3 then only power of 2 is supported. */
5401 gcc_assert (exact_log2 (length) != -1);
5403 for (i = 0; i < nelt; ++i)
5404 sel[i] = i * 2;
5405 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5407 for (i = 0; i < nelt; ++i)
5408 sel[i] = i * 2 + 1;
5409 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5411 for (i = 0; i < log_length; i++)
5413 for (j = 0; j < length; j += 2)
5415 first_vect = dr_chain[j];
5416 second_vect = dr_chain[j+1];
5418 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5419 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5420 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5421 first_vect, second_vect,
5422 perm_mask_even);
5423 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5424 (*result_chain)[j/2] = data_ref;
5426 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5427 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5428 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5429 first_vect, second_vect,
5430 perm_mask_odd);
5431 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5432 (*result_chain)[j/2+length/2] = data_ref;
5434 memcpy (dr_chain.address (), result_chain->address (),
5435 length * sizeof (tree));
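/* Illustration only, not part of GCC: a minimal scalar sketch of the
   extract-even / extract-odd reordering performed by vect_permute_load_chain
   for a power-of-two LENGTH, using the same element numbering as the example
   in the comment above.  The names here (scalar_permute_load_chain, NELT_EX)
   are hypothetical and exist only for this sketch; it handles the documented
   case of at most 4 vectors of NELT_EX elements.  */

#define NELT_EX 8

static void
scalar_permute_load_chain (unsigned *chain[], unsigned length)
{
  unsigned tmp[4][NELT_EX];
  unsigned log_length = 0;

  for (unsigned l = length; l > 1; l >>= 1)
    log_length++;

  for (unsigned step = 0; step < log_length; step++)
    {
      for (unsigned j = 0; j < length; j += 2)
	{
	  /* "Even" selection from the concatenation chain[j] | chain[j+1],
	     i.e. selector {0, 2, 4, ..., 2*NELT_EX-2}.  */
	  for (unsigned i = 0; i < NELT_EX; i++)
	    tmp[j / 2][i] = (2 * i < NELT_EX
			     ? chain[j][2 * i]
			     : chain[j + 1][2 * i - NELT_EX]);
	  /* "Odd" selection, selector {1, 3, 5, ..., 2*NELT_EX-1}.  */
	  for (unsigned i = 0; i < NELT_EX; i++)
	    tmp[j / 2 + length / 2][i] = (2 * i + 1 < NELT_EX
					  ? chain[j][2 * i + 1]
					  : chain[j + 1][2 * i + 1 - NELT_EX]);
	}
      /* Feed the results of this step back in, as the memcpy above does.  */
      for (unsigned v = 0; v < length; v++)
	for (unsigned i = 0; i < NELT_EX; i++)
	  chain[v][i] = tmp[v][i];
    }
}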
5440 /* Function vect_shift_permute_load_chain.
5442 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5443 sequence of stmts to reorder the input data accordingly.
5444 Return the final references for loads in RESULT_CHAIN.
5445 Return true if successful, false otherwise.
5447 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5448 The input is 3 vectors each containing 8 elements. We assign a
5449 number to each element, the input sequence is:
5451 1st vec: 0 1 2 3 4 5 6 7
5452 2nd vec: 8 9 10 11 12 13 14 15
5453 3rd vec: 16 17 18 19 20 21 22 23
5455 The output sequence should be:
5457 1st vec: 0 3 6 9 12 15 18 21
5458 2nd vec: 1 4 7 10 13 16 19 22
5459 3rd vec: 2 5 8 11 14 17 20 23
5461 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such an output.
5463 First we shuffle all 3 vectors to get the correct element order:
5465 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5466 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5467 3rd vec: (16 19 22) (17 20 23) (18 21)
5469 Next we unite and shift the vectors 3 times:
5471 1st step:
5472 shift right by 6 the concatenation of:
5473 "1st vec" and "2nd vec"
5474 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5475 "2nd vec" and "3rd vec"
5476 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5477 "3rd vec" and "1st vec"
5478 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5479 | New vectors |
5481 So now the new vectors are:
5483 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5484 2nd vec: (10 13) (16 19 22) (17 20 23)
5485 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5487 2nd step:
5488 shift right by 5 the concatenation of:
5489 "1st vec" and "3rd vec"
5490 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5491 "2nd vec" and "1st vec"
5492 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5493 "3rd vec" and "2nd vec"
5494 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5495 | New vectors |
5499 So now the new vectors are:
5499 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5500 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5501 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5503 3rd step:
5504 shift right by 5 the concatenation of:
5505 "1st vec" and "1st vec"
5506 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5507 shift right by 3 the concatenation of:
5508 "2nd vec" and "2nd vec"
5509 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5510 | New vectors |
5512 So now all vectors are READY:
5513 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5514 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5515 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5517 This algorithm is faster than the one in vect_permute_load_chain if:
5518 1. "shift of a concatenation" is faster than a general permutation.
5519 This is usually so.
5520 2. The TARGET machine can't execute vector instructions in parallel.
5521 This matters because each step of the algorithm depends on the previous one.
5522 The algorithm in vect_permute_load_chain is much more parallel.
5524 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5527 static bool
5528 vect_shift_permute_load_chain (vec<tree> dr_chain,
5529 unsigned int length,
5530 gimple *stmt,
5531 gimple_stmt_iterator *gsi,
5532 vec<tree> *result_chain)
5534 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5535 tree perm2_mask1, perm2_mask2, perm3_mask;
5536 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5537 gimple *perm_stmt;
5539 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5540 unsigned int i;
5541 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5542 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5543 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5544 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5546 result_chain->quick_grow (length);
5547 memcpy (result_chain->address (), dr_chain.address (),
5548 length * sizeof (tree));
5550 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5552 unsigned int j, log_length = exact_log2 (length);
5553 for (i = 0; i < nelt / 2; ++i)
5554 sel[i] = i * 2;
5555 for (i = 0; i < nelt / 2; ++i)
5556 sel[nelt / 2 + i] = i * 2 + 1;
5557 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5559 if (dump_enabled_p ())
5560 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5561 "shuffle of 2 fields structure is not \
5562 supported by target\n");
5563 return false;
5565 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5567 for (i = 0; i < nelt / 2; ++i)
5568 sel[i] = i * 2 + 1;
5569 for (i = 0; i < nelt / 2; ++i)
5570 sel[nelt / 2 + i] = i * 2;
5571 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5573 if (dump_enabled_p ())
5574 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5575 "shuffle of 2 fields structure is not \
5576 supported by target\n");
5577 return false;
5579 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5581 /* Generating permutation constant to shift all elements.
5582 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5583 for (i = 0; i < nelt; i++)
5584 sel[i] = nelt / 2 + i;
5585 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5587 if (dump_enabled_p ())
5588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5589 "shift permutation is not supported by target\n");
5590 return false;
5592 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5594 /* Generating permutation constant to select vector from 2.
5595 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5596 for (i = 0; i < nelt / 2; i++)
5597 sel[i] = i;
5598 for (i = nelt / 2; i < nelt; i++)
5599 sel[i] = nelt + i;
5600 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5602 if (dump_enabled_p ())
5603 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5604 "select is not supported by target\n");
5605 return false;
5607 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5609 for (i = 0; i < log_length; i++)
5611 for (j = 0; j < length; j += 2)
5613 first_vect = dr_chain[j];
5614 second_vect = dr_chain[j + 1];
5616 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5617 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5618 first_vect, first_vect,
5619 perm2_mask1);
5620 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5621 vect[0] = data_ref;
5623 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5624 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5625 second_vect, second_vect,
5626 perm2_mask2);
5627 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5628 vect[1] = data_ref;
5630 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5631 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5632 vect[0], vect[1], shift1_mask);
5633 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5634 (*result_chain)[j/2 + length/2] = data_ref;
5636 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5637 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5638 vect[0], vect[1], select_mask);
5639 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5640 (*result_chain)[j/2] = data_ref;
5642 memcpy (dr_chain.address (), result_chain->address (),
5643 length * sizeof (tree));
5645 return true;
5647 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5649 unsigned int k = 0, l = 0;
5651 /* Generating permutation constant to get all elements in the right order.
5652 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5653 for (i = 0; i < nelt; i++)
5655 if (3 * k + (l % 3) >= nelt)
5657 k = 0;
5658 l += (3 - (nelt % 3));
5660 sel[i] = 3 * k + (l % 3);
5661 k++;
5663 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5665 if (dump_enabled_p ())
5666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5667 "shuffle of 3 fields structure is not \
5668 supported by target\n");
5669 return false;
5671 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5673 /* Generating permutation constant to shift all elements.
5674 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5675 for (i = 0; i < nelt; i++)
5676 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5677 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5679 if (dump_enabled_p ())
5680 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5681 "shift permutation is not supported by target\n");
5682 return false;
5684 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5686 /* Generating permutation constant to shift all elements.
5687 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5688 for (i = 0; i < nelt; i++)
5689 sel[i] = 2 * (nelt / 3) + 1 + i;
5690 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5692 if (dump_enabled_p ())
5693 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5694 "shift permutation is not supported by target\n");
5695 return false;
5697 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5699 /* Generating permutation constant to shift all elements.
5700 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5701 for (i = 0; i < nelt; i++)
5702 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5703 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5705 if (dump_enabled_p ())
5706 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5707 "shift permutation is not supported by target\n");
5708 return false;
5710 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5712 /* Generating permutation constant to shift all elements.
5713 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5714 for (i = 0; i < nelt; i++)
5715 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5716 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5718 if (dump_enabled_p ())
5719 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5720 "shift permutation is not supported by target\n");
5721 return false;
5723 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5725 for (k = 0; k < 3; k++)
5727 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5728 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5729 dr_chain[k], dr_chain[k],
5730 perm3_mask);
5731 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5732 vect[k] = data_ref;
5735 for (k = 0; k < 3; k++)
5737 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5738 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5739 vect[k % 3], vect[(k + 1) % 3],
5740 shift1_mask);
5741 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5742 vect_shift[k] = data_ref;
5745 for (k = 0; k < 3; k++)
5747 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5748 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5749 vect_shift[(4 - k) % 3],
5750 vect_shift[(3 - k) % 3],
5751 shift2_mask);
5752 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5753 vect[k] = data_ref;
5756 (*result_chain)[3 - (nelt % 3)] = vect[2];
5758 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5759 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5760 vect[0], shift3_mask);
5761 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5762 (*result_chain)[nelt % 3] = data_ref;
5764 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5765 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5766 vect[1], shift4_mask);
5767 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5768 (*result_chain)[0] = data_ref;
5769 return true;
5771 return false;
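/* Illustration only, not part of GCC: a scalar model of the "shift of a
   concatenation" primitive used by vect_shift_permute_load_chain above,
   i.e. a VEC_PERM_EXPR whose selector is {S, S+1, ..., S+NELT-1}, selecting
   a window of NELT contiguous elements from the concatenation A | B.  The
   helper name (shift_concat) is hypothetical.  For example, with nelt == 8
   and shift == 6 this corresponds to shift1_mask = {6 7 8 9 10 11 12 13}
   in the LENGTH == 3 case.  */

static void
shift_concat (const unsigned *a, const unsigned *b, unsigned nelt,
	      unsigned shift, unsigned *out)
{
  for (unsigned i = 0; i < nelt; i++)
    {
      unsigned idx = shift + i;	/* index into the 2*nelt-element concatenation */
      out[i] = idx < nelt ? a[idx] : b[idx - nelt];
    }
}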
5774 /* Function vect_transform_grouped_load.
5776 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5777 to perform their permutation and ascribe the resulting vectorized statements to
5778 the scalar statements.
5781 void
5782 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5783 gimple_stmt_iterator *gsi)
5785 machine_mode mode;
5786 vec<tree> result_chain = vNULL;
5788 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5789 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5790 vectors, that are ready for vector computation. */
5791 result_chain.create (size);
5793 /* If the reassociation width for the vector type is 2 or greater, the target
5794 machine can execute 2 or more vector instructions in parallel. Otherwise
5795 try to get the chain for the load group using vect_shift_permute_load_chain. */
5796 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5797 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5798 || exact_log2 (size) != -1
5799 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5800 gsi, &result_chain))
5801 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5802 vect_record_grouped_load_vectors (stmt, result_chain);
5803 result_chain.release ();
5806 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5807 generated as part of the vectorization of STMT. Assign the statement
5808 for each vector to the associated scalar statement. */
5810 void
5811 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5813 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5814 gimple *next_stmt, *new_stmt;
5815 unsigned int i, gap_count;
5816 tree tmp_data_ref;
5818 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5819 Since we scan the chain starting from its first node, their order
5820 corresponds to the order of data-refs in RESULT_CHAIN. */
5821 next_stmt = first_stmt;
5822 gap_count = 1;
5823 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5825 if (!next_stmt)
5826 break;
5828 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5829 code elimination pass later. No need to check for the first stmt in
5830 the group, since it always exists.
5831 GROUP_GAP is the number of steps in elements from the previous
5832 access (if there is no gap GROUP_GAP is 1). We skip loads that
5833 correspond to the gaps. */
5834 if (next_stmt != first_stmt
5835 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5837 gap_count++;
5838 continue;
5841 while (next_stmt)
5843 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5844 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5845 copies, and we put the new vector statement in the first available
5846 RELATED_STMT. */
5847 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5848 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5849 else
5851 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5853 gimple *prev_stmt =
5854 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5855 gimple *rel_stmt =
5856 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5857 while (rel_stmt)
5859 prev_stmt = rel_stmt;
5860 rel_stmt =
5861 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5864 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5865 new_stmt;
5869 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5870 gap_count = 1;
5871 /* If NEXT_STMT accesses the same DR as the previous statement,
5872 put the same TMP_DATA_REF as its vectorized statement; otherwise
5873 get the next data-ref from RESULT_CHAIN. */
5874 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5875 break;
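/* Illustration only, not part of GCC: a sketch of the gap-skipping loop
   above, with a group member reduced to just its GROUP_GAP value.  The
   names (toy_group_member, assign_group_vectors) are hypothetical, and the
   multiple-copy / GROUP_SAME_DR_STMT handling is omitted.  */

struct toy_group_member
{
  unsigned gap;		/* GROUP_GAP: 1 means no gap before this member.  */
  int vec;		/* index of the assigned vector, or -1 if none.  */
};

static void
assign_group_vectors (struct toy_group_member *group, unsigned group_size,
		      unsigned nvectors)
{
  unsigned next = 0, gap_count = 1;

  for (unsigned i = 0; i < nvectors && next < group_size; i++)
    {
      /* A vector loaded for a gap is simply dropped: stay on the same group
	 member until GAP_COUNT reaches its GROUP_GAP.  */
      if (next != 0 && gap_count < group[next].gap)
	{
	  gap_count++;
	  continue;
	}
      group[next].vec = (int) i;
      next++;
      gap_count = 1;
    }
}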
5880 /* Function vect_can_force_dr_alignment_p.
5882 Returns whether the alignment of a DECL can be forced to be aligned
5883 on an ALIGNMENT-bit boundary. */
5885 bool
5886 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5888 if (TREE_CODE (decl) != VAR_DECL)
5889 return false;
5891 if (decl_in_symtab_p (decl)
5892 && !symtab_node::get (decl)->can_increase_alignment_p ())
5893 return false;
5895 if (TREE_STATIC (decl))
5896 return (alignment <= MAX_OFILE_ALIGNMENT);
5897 else
5898 return (alignment <= MAX_STACK_ALIGNMENT);
5902 /* Return whether the data reference DR is supported with respect to its
5903 alignment.
5904 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5905 if it is aligned, i.e., check if it is possible to vectorize it with a
5906 different alignment. */
5908 enum dr_alignment_support
5909 vect_supportable_dr_alignment (struct data_reference *dr,
5910 bool check_aligned_accesses)
5912 gimple *stmt = DR_STMT (dr);
5913 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5914 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5915 machine_mode mode = TYPE_MODE (vectype);
5916 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5917 struct loop *vect_loop = NULL;
5918 bool nested_in_vect_loop = false;
5920 if (aligned_access_p (dr) && !check_aligned_accesses)
5921 return dr_aligned;
5923 /* For now assume all conditional loads/stores support unaligned
5924 access without any special code. */
5925 if (is_gimple_call (stmt)
5926 && gimple_call_internal_p (stmt)
5927 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5928 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5929 return dr_unaligned_supported;
5931 if (loop_vinfo)
5933 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5934 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5937 /* Possibly unaligned access. */
5939 /* We can choose between using the implicit realignment scheme (generating
5940 a misaligned_move stmt) and the explicit realignment scheme (generating
5941 aligned loads with a REALIGN_LOAD). There are two variants to the
5942 explicit realignment scheme: optimized, and unoptimized.
5943 We can optimize the realignment only if the step between consecutive
5944 vector loads is equal to the vector size. Since the vector memory
5945 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5946 is guaranteed that the misalignment amount remains the same throughout the
5947 execution of the vectorized loop. Therefore, we can create the
5948 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5949 at the loop preheader.
5951 However, in the case of outer-loop vectorization, when vectorizing a
5952 memory access in the inner-loop nested within the LOOP that is now being
5953 vectorized, while it is guaranteed that the misalignment of the
5954 vectorized memory access will remain the same in different outer-loop
5955 iterations, it is *not* guaranteed that it will remain the same throughout
5956 the execution of the inner-loop. This is because the inner-loop advances
5957 with the original scalar step (and not in steps of VS). If the inner-loop
5958 step happens to be a multiple of VS, then the misalignment remains fixed
5959 and we can use the optimized realignment scheme. For example:
5961 for (i=0; i<N; i++)
5962 for (j=0; j<M; j++)
5963 s += a[i+j];
5965 When vectorizing the i-loop in the above example, the step between
5966 consecutive vector loads is 1, and so the misalignment does not remain
5967 fixed across the execution of the inner-loop, and the realignment cannot
5968 be optimized (as illustrated in the following pseudo vectorized loop):
5970 for (i=0; i<N; i+=4)
5971 for (j=0; j<M; j++){
5972 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5973 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5974 // (assuming that we start from an aligned address).
5977 We therefore have to use the unoptimized realignment scheme:
5979 for (i=0; i<N; i+=4)
5980 for (j=k; j<M; j+=4)
5981 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5982 // that the misalignment of the initial address is
5983 // 0).
5985 The loop can then be vectorized as follows:
5987 for (k=0; k<4; k++){
5988 rt = get_realignment_token (&vp[k]);
5989 for (i=0; i<N; i+=4){
5990 v1 = vp[i+k];
5991 for (j=k; j<M; j+=4){
5992 v2 = vp[i+j+VS-1];
5993 va = REALIGN_LOAD <v1,v2,rt>;
5994 vs += va;
5995 v1 = v2;
5998 } */
6000 if (DR_IS_READ (dr))
6002 bool is_packed = false;
6003 tree type = (TREE_TYPE (DR_REF (dr)));
6005 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6006 && (!targetm.vectorize.builtin_mask_for_load
6007 || targetm.vectorize.builtin_mask_for_load ()))
6009 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6011 /* If we are doing SLP then the accesses need not have the
6012 same alignment, instead it depends on the SLP group size. */
6013 if (loop_vinfo
6014 && STMT_SLP_TYPE (stmt_info)
6015 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6016 * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
6017 % TYPE_VECTOR_SUBPARTS (vectype) != 0))
6019 else if (!loop_vinfo
6020 || (nested_in_vect_loop
6021 && (TREE_INT_CST_LOW (DR_STEP (dr))
6022 != GET_MODE_SIZE (TYPE_MODE (vectype)))))
6023 return dr_explicit_realign;
6024 else
6025 return dr_explicit_realign_optimized;
6027 if (!known_alignment_for_access_p (dr))
6028 is_packed = not_size_aligned (DR_REF (dr));
6030 if ((TYPE_USER_ALIGN (type) && !is_packed)
6031 || targetm.vectorize.
6032 support_vector_misalignment (mode, type,
6033 DR_MISALIGNMENT (dr), is_packed))
6034 /* Can't software pipeline the loads, but can at least do them. */
6035 return dr_unaligned_supported;
6037 else
6039 bool is_packed = false;
6040 tree type = (TREE_TYPE (DR_REF (dr)));
6042 if (!known_alignment_for_access_p (dr))
6043 is_packed = not_size_aligned (DR_REF (dr));
6045 if ((TYPE_USER_ALIGN (type) && !is_packed)
6046 || targetm.vectorize.
6047 support_vector_misalignment (mode, type,
6048 DR_MISALIGNMENT (dr), is_packed))
6049 return dr_unaligned_supported;
6052 /* Unsupported. */
6053 return dr_unaligned_unsupported;
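/* Illustration only, not part of GCC: a scalar model of the REALIGN_LOAD
   operation referred to in the comment inside vect_supportable_dr_alignment.
   Given two consecutive aligned vector loads V1 and V2, it extracts the
   unaligned vector that starts K elements into V1; real targets encode the
   realignment token as a permutation mask rather than a plain element
   offset, and the helper name (scalar_realign_load) is hypothetical.  */

static void
scalar_realign_load (const unsigned *v1, const unsigned *v2,
		     unsigned nelt, unsigned k, unsigned *out)
{
  for (unsigned i = 0; i < nelt; i++)
    out[i] = (k + i < nelt) ? v1[k + i] : v2[k + i - nelt];
}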