gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2016 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "tm_p.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "cgraph.h"
35 #include "dumpfile.h"
36 #include "alias.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "tree-eh.h"
40 #include "gimplify.h"
41 #include "gimple-iterator.h"
42 #include "gimplify-me.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "tree-ssa-loop-manip.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "expr.h"
50 #include "builtins.h"
51 #include "params.h"
53 /* Return true if load- or store-lanes optab OPTAB is implemented for
54 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
56 static bool
57 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
58 tree vectype, unsigned HOST_WIDE_INT count)
60 machine_mode mode, array_mode;
61 bool limit_p;
63 mode = TYPE_MODE (vectype);
64 limit_p = !targetm.array_mode_supported_p (mode, count);
65 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
66 MODE_INT, limit_p);
68 if (array_mode == BLKmode)
70 if (dump_enabled_p ())
71 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
72 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
73 GET_MODE_NAME (mode), count);
74 return false;
77 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
79 if (dump_enabled_p ())
80 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
81 "cannot use %s<%s><%s>\n", name,
82 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
83 return false;
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_NOTE, vect_location,
88 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
89 GET_MODE_NAME (mode));
91 return true;
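/* Illustrative example (hypothetical target, for exposition only): with a
   128-bit vectype such as V4SI and COUNT == 2, the code above asks for a
   256-bit integer mode to act as the array mode.  If the target advertises
   support via targetm.array_mode_supported_p and provides a
   vec_load_lanes/vec_store_lanes handler for that <array_mode, V4SI> pair
   (as, for example, AArch64-style ld2/st2 patterns are set up to do), the
   function reports success; if no integer mode of the requested size
   exists, mode_for_size yields BLKmode and the check fails.  */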
95 /* Return the smallest scalar part of STMT.
96 This is used to determine the vectype of the stmt. We generally set the
97 vectype according to the type of the result (lhs). For stmts whose
98 result-type is different than the type of the arguments (e.g., demotion,
99 promotion), vectype will be reset appropriately (later). Note that we have
100 to visit the smallest datatype in this function, because that determines the
101 VF. If the smallest datatype in the loop is present only as the rhs of a
102 promotion operation, we'd miss it.
103 Such a case, where a variable of this datatype does not appear in the lhs
104 anywhere in the loop, can only occur if it's an invariant: e.g.:
105 'int_x = (int) short_inv', which we'd expect to have been optimized away by
106 invariant motion. However, we cannot rely on invariant motion to always
107 take invariants out of the loop, and so in the case of promotion we also
108 have to check the rhs.
109 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
110 types. */
112 tree
113 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
114 HOST_WIDE_INT *rhs_size_unit)
116 tree scalar_type = gimple_expr_type (stmt);
117 HOST_WIDE_INT lhs, rhs;
119 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
121 if (is_gimple_assign (stmt)
122 && (gimple_assign_cast_p (stmt)
123 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
124 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
125 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
127 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
129 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
130 if (rhs < lhs)
131 scalar_type = rhs_type;
134 *lhs_size_unit = lhs;
135 *rhs_size_unit = rhs;
136 return scalar_type;
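/* Illustrative example (hypothetical loop, for exposition only): in

     short *s; int *d;
     for (i = 0; i < n; i++)
       d[i] = (int) s[i];

   the widening statement has a 4-byte lhs type and a 2-byte rhs type, so
   the function returns the 2-byte type with *LHS_SIZE_UNIT == 4 and
   *RHS_SIZE_UNIT == 2; with 128-bit vectors the vectorization factor is
   then based on 8 shorts per vector rather than 4 ints.  */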
140 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
141 tested at run-time. Return TRUE if DDR was successfully inserted.
142 Return false if versioning is not supported. */
144 static bool
145 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
147 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
149 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
150 return false;
152 if (dump_enabled_p ())
154 dump_printf_loc (MSG_NOTE, vect_location,
155 "mark for run-time aliasing test between ");
156 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
157 dump_printf (MSG_NOTE, " and ");
158 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
159 dump_printf (MSG_NOTE, "\n");
162 if (optimize_loop_nest_for_size_p (loop))
164 if (dump_enabled_p ())
165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
166 "versioning not supported when optimizing"
167 " for size.\n");
168 return false;
171 /* FORNOW: We don't support versioning with outer-loop vectorization. */
172 if (loop->inner)
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
176 "versioning not yet supported for outer-loops.\n");
177 return false;
180 /* FORNOW: We don't support creating runtime alias tests for non-constant
181 step. */
182 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
183 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
185 if (dump_enabled_p ())
186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
187 "versioning not yet supported for non-constant "
188 "step\n");
189 return false;
192 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
193 return true;
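/* Conceptual sketch (simplified and hypothetical) of what the recorded
   DDRs eventually turn into: during transformation the loop is versioned
   under a pairwise non-overlap test roughly of the form

     if (addr_a + segment_len_a <= addr_b
         || addr_b + segment_len_b <= addr_a)
       ... vectorized loop ...
     else
       ... scalar loop ...

   where the segment lengths are derived from DR_STEP and the number of
   vectorized iterations; this is why a constant DR_STEP is required
   above.  */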
197 /* Function vect_analyze_data_ref_dependence.
199 Return TRUE if there (might) exist a dependence between a memory-reference
200 DRA and a memory-reference DRB.  If the dependence can instead be checked
201 at run-time by versioning for alias, return FALSE.  Adjust *MAX_VF according to
202 the data dependence. */
204 static bool
205 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
206 loop_vec_info loop_vinfo, int *max_vf)
208 unsigned int i;
209 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
210 struct data_reference *dra = DDR_A (ddr);
211 struct data_reference *drb = DDR_B (ddr);
212 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
213 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
214 lambda_vector dist_v;
215 unsigned int loop_depth;
217 /* In loop analysis all data references should be vectorizable. */
218 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
219 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
220 gcc_unreachable ();
222 /* Independent data accesses. */
223 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
224 return false;
226 if (dra == drb
227 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
228 return false;
230 /* Even if we have an anti-dependence then, as the vectorized loop covers at
231 least two scalar iterations, there is always also a true dependence.
232 As the vectorizer does not re-order loads and stores we can ignore
233 the anti-dependence if TBAA can disambiguate both DRs similar to the
234 case with known negative distance anti-dependences (positive
235 distance anti-dependences would violate TBAA constraints). */
236 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
237 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
238 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
239 get_alias_set (DR_REF (drb))))
240 return false;
242 /* Unknown data dependence. */
243 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
245 /* If user asserted safelen consecutive iterations can be
246 executed concurrently, assume independence. */
247 if (loop->safelen >= 2)
249 if (loop->safelen < *max_vf)
250 *max_vf = loop->safelen;
251 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
252 return false;
255 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
256 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
258 if (dump_enabled_p ())
260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
261 "versioning for alias not supported for: "
262 "can't determine dependence between ");
263 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
264 DR_REF (dra));
265 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
266 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
267 DR_REF (drb));
268 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
270 return true;
273 if (dump_enabled_p ())
275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
276 "versioning for alias required: "
277 "can't determine dependence between ");
278 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
279 DR_REF (dra));
280 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
281 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
282 DR_REF (drb));
283 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
286 /* Add to list of ddrs that need to be tested at run-time. */
287 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
290 /* Known data dependence. */
291 if (DDR_NUM_DIST_VECTS (ddr) == 0)
293 /* If user asserted safelen consecutive iterations can be
294 executed concurrently, assume independence. */
295 if (loop->safelen >= 2)
297 if (loop->safelen < *max_vf)
298 *max_vf = loop->safelen;
299 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
300 return false;
303 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
304 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
306 if (dump_enabled_p ())
308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
309 "versioning for alias not supported for: "
310 "bad dist vector for ");
311 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
312 DR_REF (dra));
313 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
314 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
315 DR_REF (drb));
316 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
318 return true;
321 if (dump_enabled_p ())
323 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
324 "versioning for alias required: "
325 "bad dist vector for ");
326 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
327 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
328 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
329 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
331 /* Add to list of ddrs that need to be tested at run-time. */
332 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
335 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
336 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
338 int dist = dist_v[loop_depth];
340 if (dump_enabled_p ())
341 dump_printf_loc (MSG_NOTE, vect_location,
342 "dependence distance = %d.\n", dist);
344 if (dist == 0)
346 if (dump_enabled_p ())
348 dump_printf_loc (MSG_NOTE, vect_location,
349 "dependence distance == 0 between ");
350 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
351 dump_printf (MSG_NOTE, " and ");
352 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
353 dump_printf (MSG_NOTE, "\n");
356 /* When we perform grouped accesses and perform implicit CSE
357 by detecting equal accesses and doing disambiguation with
358 runtime alias tests like for
359 .. = a[i];
360 .. = a[i+1];
361 a[i] = ..;
362 a[i+1] = ..;
363 *p = ..;
364 .. = a[i];
365 .. = a[i+1];
366 where we will end up loading { a[i], a[i+1] } once, make
367 sure that inserting group loads before the first load and
368 stores after the last store will do the right thing.
369 Similar for groups like
370 a[i] = ...;
371 ... = a[i];
372 a[i+1] = ...;
373 where loads from the group interleave with the store. */
374 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
375 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
377 gimple *earlier_stmt;
378 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
379 if (DR_IS_WRITE
380 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
382 if (dump_enabled_p ())
383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
384 "READ_WRITE dependence in interleaving."
385 "\n");
386 return true;
390 continue;
393 if (dist > 0 && DDR_REVERSED_P (ddr))
395 /* If DDR_REVERSED_P the order of the data-refs in DDR was
396 reversed (to make distance vector positive), and the actual
397 distance is negative. */
398 if (dump_enabled_p ())
399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
400 "dependence distance negative.\n");
401 /* Record a negative dependence distance to later limit the
402 amount of stmt copying / unrolling we can perform.
403 Only need to handle read-after-write dependence. */
404 if (DR_IS_READ (drb)
405 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
406 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
407 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
408 continue;
411 if (abs (dist) >= 2
412 && abs (dist) < *max_vf)
414 /* The dependence distance requires reduction of the maximal
415 vectorization factor. */
416 *max_vf = abs (dist);
417 if (dump_enabled_p ())
418 dump_printf_loc (MSG_NOTE, vect_location,
419 "adjusting maximal vectorization factor to %i\n",
420 *max_vf);
423 if (abs (dist) >= *max_vf)
425 /* Dependence distance does not create dependence, as far as
426 vectorization is concerned, in this case. */
427 if (dump_enabled_p ())
428 dump_printf_loc (MSG_NOTE, vect_location,
429 "dependence distance >= VF.\n");
430 continue;
433 if (dump_enabled_p ())
435 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
436 "not vectorized, possible dependence "
437 "between data-refs ");
438 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
439 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
440 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
441 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
444 return true;
447 return false;
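/* Worked example (hypothetical loop): for

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1;

   the read-after-write dependence has distance 3.  With an incoming
   *MAX_VF of, say, 8, the test "abs (dist) >= 2 && abs (dist) < *max_vf"
   lowers *MAX_VF to 3, after which "abs (dist) >= *max_vf" holds and the
   dependence no longer constrains vectorization; a vectorization factor
   of at most 3 (in practice 2) remains possible.  */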
450 /* Function vect_analyze_data_ref_dependences.
452 Examine all the data references in the loop, and make sure there do not
453 exist any data dependences between them. Set *MAX_VF according to
454 the maximum vectorization factor the data dependences allow. */
456 bool
457 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
459 unsigned int i;
460 struct data_dependence_relation *ddr;
462 if (dump_enabled_p ())
463 dump_printf_loc (MSG_NOTE, vect_location,
464 "=== vect_analyze_data_ref_dependences ===\n");
466 LOOP_VINFO_DDRS (loop_vinfo)
467 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
468 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
469 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
470 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
471 &LOOP_VINFO_DDRS (loop_vinfo),
472 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
473 return false;
475 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
476 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
477 return false;
479 return true;
483 /* Function vect_slp_analyze_data_ref_dependence.
485 Return TRUE if there (might) exist a dependence between a memory-reference
486 DRA and a memory-reference DRB that would prohibit SLP vectorization.
487 Unlike the loop variant above, no run-time alias versioning is attempted
488 here and there is no vectorization factor to adjust. */
490 static bool
491 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
493 struct data_reference *dra = DDR_A (ddr);
494 struct data_reference *drb = DDR_B (ddr);
496 /* We need to check dependences of statements marked as unvectorizable
497 as well, they still can prohibit vectorization. */
499 /* Independent data accesses. */
500 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
501 return false;
503 if (dra == drb)
504 return false;
506 /* Read-read is OK. */
507 if (DR_IS_READ (dra) && DR_IS_READ (drb))
508 return false;
510 /* If dra and drb are part of the same interleaving chain consider
511 them independent. */
512 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
513 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
514 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
515 return false;
517 /* Unknown data dependence. */
518 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
520 if (dump_enabled_p ())
522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
523 "can't determine dependence between ");
524 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
525 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
526 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
527 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
530 else if (dump_enabled_p ())
532 dump_printf_loc (MSG_NOTE, vect_location,
533 "determined dependence between ");
534 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
535 dump_printf (MSG_NOTE, " and ");
536 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
537 dump_printf (MSG_NOTE, "\n");
540 return true;
544 /* Analyze dependences involved in the transform of SLP NODE. STORES
545 contain the vector of scalar stores of this instance if we are
546 disambiguating the loads. */
548 static bool
549 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
550 vec<gimple *> stores, gimple *last_store)
552 /* This walks over all stmts involved in the SLP load/store done
553 in NODE verifying we can sink them up to the last stmt in the
554 group. */
555 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
556 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
558 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
559 if (access == last_access)
560 continue;
561 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
562 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
563 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
565 gimple *stmt = gsi_stmt (gsi);
566 if (! gimple_vuse (stmt)
567 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
568 continue;
570 /* If we couldn't record a (single) data reference for this
571 stmt we have to give up. */
572 /* ??? Here and below if dependence analysis fails we can resort
573 to the alias oracle which can handle more kinds of stmts. */
574 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
575 if (!dr_b)
576 return false;
578 /* If we run into a store of this same instance (we've just
579 marked those) then delay dependence checking until we run
580 into the last store because this is where it will have
581 been sunk to (and we verify if we can do that as well). */
582 if (gimple_visited_p (stmt))
584 if (stmt != last_store)
585 continue;
586 unsigned i;
587 gimple *store;
588 FOR_EACH_VEC_ELT (stores, i, store)
590 data_reference *store_dr
591 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
592 ddr_p ddr = initialize_data_dependence_relation
593 (dr_a, store_dr, vNULL);
594 if (vect_slp_analyze_data_ref_dependence (ddr))
596 free_dependence_relation (ddr);
597 return false;
599 free_dependence_relation (ddr);
603 ddr_p ddr = initialize_data_dependence_relation (dr_a, dr_b, vNULL);
604 if (vect_slp_analyze_data_ref_dependence (ddr))
606 free_dependence_relation (ddr);
607 return false;
609 free_dependence_relation (ddr);
612 return true;
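/* Illustrative example (hypothetical basic block): for an SLP store group

     a[0] = x;
     *p = 1;
     a[1] = y;

   the vectorized store will be emitted at the position of the last group
   member (a[1] = y), so the walk above checks whether a[0] = x can be sunk
   past the intervening *p = 1.  When the loads of the instance are
   analyzed, stores of the same instance are skipped until LAST_STORE is
   reached, where they are all checked at their sunk position.  */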
616 /* Function vect_slp_analyze_instance_dependence.
618 Examine all the data references in the SLP instance, and make sure there
619 are no data dependences that would prohibit vectorizing it, i.e. that its
620 loads and stores can be sunk to the vectorized statement positions. */
622 bool
623 vect_slp_analyze_instance_dependence (slp_instance instance)
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE, vect_location,
627 "=== vect_slp_analyze_instance_dependence ===\n");
629 /* The stores of this instance are at the root of the SLP tree. */
630 slp_tree store = SLP_INSTANCE_TREE (instance);
631 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
632 store = NULL;
634 /* Verify we can sink stores to the vectorized stmt insert location. */
635 gimple *last_store = NULL;
636 if (store)
638 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
639 return false;
641 /* Mark stores in this instance and remember the last one. */
642 last_store = vect_find_last_scalar_stmt_in_slp (store);
643 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
644 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
647 bool res = true;
649 /* Verify we can sink loads to the vectorized stmt insert location,
650 special-casing stores of this instance. */
651 slp_tree load;
652 unsigned int i;
653 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
654 if (! vect_slp_analyze_node_dependences (instance, load,
655 store
656 ? SLP_TREE_SCALAR_STMTS (store)
657 : vNULL, last_store))
659 res = false;
660 break;
663 /* Unset the visited flag. */
664 if (store)
665 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
666 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
668 return res;
671 /* Function vect_compute_data_ref_alignment
673 Compute the misalignment of the data reference DR.
675 Output:
676 1. If during the misalignment computation it is found that the data reference
677 cannot be vectorized then false is returned.
678 2. DR_MISALIGNMENT (DR) is defined.
680 FOR NOW: No analysis is actually performed. Misalignment is calculated
681 only for trivial cases. TODO. */
683 bool
684 vect_compute_data_ref_alignment (struct data_reference *dr)
686 gimple *stmt = DR_STMT (dr);
687 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
688 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
689 struct loop *loop = NULL;
690 tree ref = DR_REF (dr);
691 tree vectype;
692 tree base, base_addr;
693 tree misalign = NULL_TREE;
694 tree aligned_to;
695 unsigned HOST_WIDE_INT alignment;
697 if (dump_enabled_p ())
698 dump_printf_loc (MSG_NOTE, vect_location,
699 "vect_compute_data_ref_alignment:\n");
701 if (loop_vinfo)
702 loop = LOOP_VINFO_LOOP (loop_vinfo);
704 /* Initialize misalignment to unknown. */
705 SET_DR_MISALIGNMENT (dr, -1);
707 if (tree_fits_shwi_p (DR_STEP (dr)))
708 misalign = DR_INIT (dr);
709 aligned_to = DR_ALIGNED_TO (dr);
710 base_addr = DR_BASE_ADDRESS (dr);
711 vectype = STMT_VINFO_VECTYPE (stmt_info);
713 /* In case the dataref is in an inner-loop of the loop that is being
714 vectorized (LOOP), we use the base and misalignment information
715 relative to the outer-loop (LOOP). This is ok only if the misalignment
716 stays the same throughout the execution of the inner-loop, which is why
717 we have to check that the stride of the dataref in the inner-loop is a
718 multiple of the vector size. */
719 if (loop && nested_in_vect_loop_p (loop, stmt))
721 tree step = DR_STEP (dr);
723 if (tree_fits_shwi_p (step)
724 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
726 if (dump_enabled_p ())
727 dump_printf_loc (MSG_NOTE, vect_location,
728 "inner step divides the vector-size.\n");
729 misalign = STMT_VINFO_DR_INIT (stmt_info);
730 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
731 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
733 else
735 if (dump_enabled_p ())
736 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
737 "inner step doesn't divide the vector-size.\n");
738 misalign = NULL_TREE;
742 /* Similarly we can only use base and misalignment information relative to
743 an innermost loop if the misalignment stays the same throughout the
744 execution of the loop. As above, this is the case if the stride of
745 the dataref, times the vectorization factor, is a multiple of the vector size. */
746 else
748 tree step = DR_STEP (dr);
749 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
751 if (tree_fits_shwi_p (step)
752 && ((tree_to_shwi (step) * vf)
753 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
755 if (dump_enabled_p ())
756 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
757 "step doesn't divide the vector-size.\n");
758 misalign = NULL_TREE;
762 /* To look at alignment of the base we have to preserve an inner MEM_REF
763 as that carries alignment information of the actual access. */
764 base = ref;
765 while (handled_component_p (base))
766 base = TREE_OPERAND (base, 0);
767 if (TREE_CODE (base) == MEM_REF)
768 base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
769 build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
770 unsigned int base_alignment = get_object_alignment (base);
772 if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
773 DR_VECT_AUX (dr)->base_element_aligned = true;
775 alignment = TYPE_ALIGN_UNIT (vectype);
777 if ((compare_tree_int (aligned_to, alignment) < 0)
778 || !misalign)
780 if (dump_enabled_p ())
782 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
783 "Unknown alignment for access: ");
784 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
785 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
787 return true;
790 if (base_alignment < TYPE_ALIGN (vectype))
792 /* Strip an inner MEM_REF to a bare decl if possible. */
793 if (TREE_CODE (base) == MEM_REF
794 && integer_zerop (TREE_OPERAND (base, 1))
795 && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
796 base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
798 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
800 if (dump_enabled_p ())
802 dump_printf_loc (MSG_NOTE, vect_location,
803 "can't force alignment of ref: ");
804 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
805 dump_printf (MSG_NOTE, "\n");
807 return true;
810 /* Force the alignment of the decl.
811 NOTE: This is the only change to the code we make during
812 the analysis phase, before deciding to vectorize the loop. */
813 if (dump_enabled_p ())
815 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
816 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
817 dump_printf (MSG_NOTE, "\n");
820 DR_VECT_AUX (dr)->base_decl = base;
821 DR_VECT_AUX (dr)->base_misaligned = true;
822 DR_VECT_AUX (dr)->base_element_aligned = true;
825 /* If this is a backward running DR then first access in the larger
826 vectype actually is N-1 elements before the address in the DR.
827 Adjust misalign accordingly. */
828 if (tree_int_cst_sgn (DR_STEP (dr)) < 0)
830 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
831 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
832 otherwise we wouldn't be here. */
833 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
834 /* PLUS because DR_STEP was negative. */
835 misalign = size_binop (PLUS_EXPR, misalign, offset);
838 SET_DR_MISALIGNMENT (dr,
839 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
841 if (dump_enabled_p ())
843 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
844 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
845 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
846 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
849 return true;
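/* Worked example (hypothetical data-ref): assume a global "int a[256]"
   whose base is known to be 16-byte aligned, a V4SI vectype (16-byte
   TYPE_ALIGN_UNIT) and the access a[i + 1] with DR_INIT == 4 and
   DR_STEP == 4.  DR_ALIGNED_TO is at least 16, so the misalignment is
   known and DR_MISALIGNMENT becomes 4 mod 16 == 4.  For a backward access
   (DR_STEP == -4) the first element of the vector access sits 3 elements
   before DR_INIT, so 3 * DR_STEP == -12 is added (via PLUS, as described
   above) before taking the modulo.  */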
853 /* Function vect_update_misalignment_for_peel
855 DR - the data reference whose misalignment is to be adjusted.
856 DR_PEEL - the data reference whose misalignment is being made
857 zero in the vector loop by the peel.
858 NPEEL - the number of iterations in the peel loop if the misalignment
859 of DR_PEEL is known at compile time. */
861 static void
862 vect_update_misalignment_for_peel (struct data_reference *dr,
863 struct data_reference *dr_peel, int npeel)
865 unsigned int i;
866 vec<dr_p> same_align_drs;
867 struct data_reference *current_dr;
868 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
869 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
870 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
871 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
873 /* For interleaved data accesses the step in the loop must be multiplied by
874 the size of the interleaving group. */
875 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
876 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
877 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
878 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
880 /* It can be assumed that the data refs with the same alignment as dr_peel
881 are aligned in the vector loop. */
882 same_align_drs
883 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
884 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
886 if (current_dr != dr)
887 continue;
888 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
889 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
890 SET_DR_MISALIGNMENT (dr, 0);
891 return;
894 if (known_alignment_for_access_p (dr)
895 && known_alignment_for_access_p (dr_peel))
897 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
898 int misal = DR_MISALIGNMENT (dr);
899 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
900 misal += negative ? -npeel * dr_size : npeel * dr_size;
901 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
902 SET_DR_MISALIGNMENT (dr, misal);
903 return;
906 if (dump_enabled_p ())
907 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
908 SET_DR_MISALIGNMENT (dr, -1);
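/* Minimal arithmetic sketch of the known-alignment case above, using
   hypothetical names and plain integer types (not used elsewhere in this
   file).  For instance, with dr_size == 4, npeel == 2, a forward step and
   a 16-byte vector alignment, a misalignment of 4 becomes
   (4 + 2 * 4) & 15 == 12.  */

static inline int
example_misalignment_after_peel (int misal, bool negative, int npeel,
				 int dr_size, int align_bytes)
{
  /* Each peeled scalar iteration moves the access by dr_size bytes
     (backwards if the step is negative); the result is reduced modulo
     the vector alignment.  */
  misal += negative ? -npeel * dr_size : npeel * dr_size;
  return misal & (align_bytes - 1);
}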
912 /* Function verify_data_ref_alignment
914 Return TRUE if DR can be handled with respect to alignment. */
916 static bool
917 verify_data_ref_alignment (data_reference_p dr)
919 enum dr_alignment_support supportable_dr_alignment
920 = vect_supportable_dr_alignment (dr, false);
921 if (!supportable_dr_alignment)
923 if (dump_enabled_p ())
925 if (DR_IS_READ (dr))
926 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
927 "not vectorized: unsupported unaligned load.");
928 else
929 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
930 "not vectorized: unsupported unaligned "
931 "store.");
933 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
934 DR_REF (dr));
935 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
937 return false;
940 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
941 dump_printf_loc (MSG_NOTE, vect_location,
942 "Vectorizing an unaligned access.\n");
944 return true;
947 /* Function vect_verify_datarefs_alignment
949 Return TRUE if all data references in the loop can be
950 handled with respect to alignment. */
952 bool
953 vect_verify_datarefs_alignment (loop_vec_info vinfo)
955 vec<data_reference_p> datarefs = vinfo->datarefs;
956 struct data_reference *dr;
957 unsigned int i;
959 FOR_EACH_VEC_ELT (datarefs, i, dr)
961 gimple *stmt = DR_STMT (dr);
962 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
964 if (!STMT_VINFO_RELEVANT_P (stmt_info))
965 continue;
967 /* For interleaving, only the alignment of the first access matters. */
968 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
969 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
970 continue;
972 /* Strided accesses perform only component accesses, alignment is
973 irrelevant for them. */
974 if (STMT_VINFO_STRIDED_P (stmt_info)
975 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
976 continue;
978 if (! verify_data_ref_alignment (dr))
979 return false;
982 return true;
985 /* Given a memory reference EXP, return whether its alignment is less
986 than its size. */
988 static bool
989 not_size_aligned (tree exp)
991 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
992 return true;
994 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
995 > get_object_alignment (exp));
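/* Illustrative example (hypothetical type): for

     struct __attribute__ ((packed)) S { char c; int i; } s;

   the reference s.i has a 32-bit TYPE_SIZE but get_object_alignment
   reports only 8 bits, so not_size_aligned returns true; for an ordinary
   "int" object both values are 32 and it returns false.  */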
998 /* Function vector_alignment_reachable_p
1000 Return true if vector alignment for DR is reachable by peeling
1001 a few loop iterations. Return false otherwise. */
1003 static bool
1004 vector_alignment_reachable_p (struct data_reference *dr)
1006 gimple *stmt = DR_STMT (dr);
1007 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1008 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1010 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1012 /* For interleaved access we peel only if number of iterations in
1013 the prolog loop ({VF - misalignment}), is a multiple of the
1014 number of the interleaved accesses. */
1015 int elem_size, mis_in_elements;
1016 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1018 /* FORNOW: handle only known alignment. */
1019 if (!known_alignment_for_access_p (dr))
1020 return false;
1022 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1023 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1025 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1026 return false;
1029 /* If misalignment is known at the compile time then allow peeling
1030 only if natural alignment is reachable through peeling. */
1031 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1033 HOST_WIDE_INT elmsize =
1034 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1035 if (dump_enabled_p ())
1037 dump_printf_loc (MSG_NOTE, vect_location,
1038 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1039 dump_printf (MSG_NOTE,
1040 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1042 if (DR_MISALIGNMENT (dr) % elmsize)
1044 if (dump_enabled_p ())
1045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1046 "data size does not divide the misalignment.\n");
1047 return false;
1051 if (!known_alignment_for_access_p (dr))
1053 tree type = TREE_TYPE (DR_REF (dr));
1054 bool is_packed = not_size_aligned (DR_REF (dr));
1055 if (dump_enabled_p ())
1056 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1057 "Unknown misalignment, is_packed = %d\n",is_packed);
1058 if ((TYPE_USER_ALIGN (type) && !is_packed)
1059 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1060 return true;
1061 else
1062 return false;
1065 return true;
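/* Worked example (hypothetical numbers): for a V4SI access with a known
   misalignment of 8 bytes, elem_size is 4 and mis_in_elements is 2, so
   aligning it needs 4 - 2 == 2 peeled scalar iterations.  If the access
   belongs to an interleaved group of size 2 this is fine (2 % 2 == 0);
   with a group of size 3 the prologue count is not a multiple of the
   group size and the function returns false.  */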
1069 /* Calculate the cost of the memory access represented by DR. */
1071 static void
1072 vect_get_data_access_cost (struct data_reference *dr,
1073 unsigned int *inside_cost,
1074 unsigned int *outside_cost,
1075 stmt_vector_for_cost *body_cost_vec)
1077 gimple *stmt = DR_STMT (dr);
1078 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1079 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1080 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1081 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1082 int ncopies = vf / nunits;
1084 if (DR_IS_READ (dr))
1085 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1086 NULL, body_cost_vec, false);
1087 else
1088 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1090 if (dump_enabled_p ())
1091 dump_printf_loc (MSG_NOTE, vect_location,
1092 "vect_get_data_access_cost: inside_cost = %d, "
1093 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1097 typedef struct _vect_peel_info
1099 int npeel;
1100 struct data_reference *dr;
1101 unsigned int count;
1102 } *vect_peel_info;
1104 typedef struct _vect_peel_extended_info
1106 struct _vect_peel_info peel_info;
1107 unsigned int inside_cost;
1108 unsigned int outside_cost;
1109 stmt_vector_for_cost body_cost_vec;
1110 } *vect_peel_extended_info;
1113 /* Peeling hashtable helpers. */
1115 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1117 static inline hashval_t hash (const _vect_peel_info *);
1118 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1121 inline hashval_t
1122 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1124 return (hashval_t) peel_info->npeel;
1127 inline bool
1128 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1130 return (a->npeel == b->npeel);
1134 /* Insert DR into peeling hash table with NPEEL as key. */
1136 static void
1137 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1138 loop_vec_info loop_vinfo, struct data_reference *dr,
1139 int npeel)
1141 struct _vect_peel_info elem, *slot;
1142 _vect_peel_info **new_slot;
1143 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1145 elem.npeel = npeel;
1146 slot = peeling_htab->find (&elem);
1147 if (slot)
1148 slot->count++;
1149 else
1151 slot = XNEW (struct _vect_peel_info);
1152 slot->npeel = npeel;
1153 slot->dr = dr;
1154 slot->count = 1;
1155 new_slot = peeling_htab->find_slot (slot, INSERT);
1156 *new_slot = slot;
1159 if (!supportable_dr_alignment
1160 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1161 slot->count += VECT_MAX_COST;
1165 /* Traverse peeling hash table to find peeling option that aligns maximum
1166 number of data accesses. */
1169 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1170 _vect_peel_extended_info *max)
1172 vect_peel_info elem = *slot;
1174 if (elem->count > max->peel_info.count
1175 || (elem->count == max->peel_info.count
1176 && max->peel_info.npeel > elem->npeel))
1178 max->peel_info.npeel = elem->npeel;
1179 max->peel_info.count = elem->count;
1180 max->peel_info.dr = elem->dr;
1183 return 1;
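/* Illustrative example (hypothetical table contents): if three data-refs
   would be aligned by peeling 2 iterations and one by peeling 6, the table
   holds the entries {npeel = 2, count = 3} and {npeel = 6, count = 1}, and
   this traversal selects npeel == 2.  On equal counts the smaller npeel
   wins, and refs whose unaligned form is unsupportable were already made
   dominant by the VECT_MAX_COST boost applied at insertion time.  */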
1187 /* Traverse peeling hash table and calculate cost for each peeling option.
1188 Find the one with the lowest cost. */
1191 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1192 _vect_peel_extended_info *min)
1194 vect_peel_info elem = *slot;
1195 int save_misalignment, dummy;
1196 unsigned int inside_cost = 0, outside_cost = 0, i;
1197 gimple *stmt = DR_STMT (elem->dr);
1198 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1199 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1200 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1201 struct data_reference *dr;
1202 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1204 prologue_cost_vec.create (2);
1205 body_cost_vec.create (2);
1206 epilogue_cost_vec.create (2);
1208 FOR_EACH_VEC_ELT (datarefs, i, dr)
1210 stmt = DR_STMT (dr);
1211 stmt_info = vinfo_for_stmt (stmt);
1212 /* For interleaving, only the alignment of the first access
1213 matters. */
1214 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1215 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1216 continue;
1218 /* Strided accesses perform only component accesses, alignment is
1219 irrelevant for them. */
1220 if (STMT_VINFO_STRIDED_P (stmt_info)
1221 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1222 continue;
1224 save_misalignment = DR_MISALIGNMENT (dr);
1225 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1226 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1227 &body_cost_vec);
1228 SET_DR_MISALIGNMENT (dr, save_misalignment);
1231 outside_cost += vect_get_known_peeling_cost
1232 (loop_vinfo, elem->npeel, &dummy,
1233 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1234 &prologue_cost_vec, &epilogue_cost_vec);
1236 /* Prologue and epilogue costs are added to the target model later.
1237 These costs depend only on the scalar iteration cost, the
1238 number of peeling iterations finally chosen, and the number of
1239 misaligned statements. So discard the information found here. */
1240 prologue_cost_vec.release ();
1241 epilogue_cost_vec.release ();
1243 if (inside_cost < min->inside_cost
1244 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1246 min->inside_cost = inside_cost;
1247 min->outside_cost = outside_cost;
1248 min->body_cost_vec.release ();
1249 min->body_cost_vec = body_cost_vec;
1250 min->peel_info.dr = elem->dr;
1251 min->peel_info.npeel = elem->npeel;
1253 else
1254 body_cost_vec.release ();
1256 return 1;
1260 /* Choose best peeling option by traversing peeling hash table and either
1261 choosing an option with the lowest cost (if cost model is enabled) or the
1262 option that aligns as many accesses as possible. */
1264 static struct data_reference *
1265 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1266 loop_vec_info loop_vinfo,
1267 unsigned int *npeel,
1268 stmt_vector_for_cost *body_cost_vec)
1270 struct _vect_peel_extended_info res;
1272 res.peel_info.dr = NULL;
1273 res.body_cost_vec = stmt_vector_for_cost ();
1275 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1277 res.inside_cost = INT_MAX;
1278 res.outside_cost = INT_MAX;
1279 peeling_htab->traverse <_vect_peel_extended_info *,
1280 vect_peeling_hash_get_lowest_cost> (&res);
1282 else
1284 res.peel_info.count = 0;
1285 peeling_htab->traverse <_vect_peel_extended_info *,
1286 vect_peeling_hash_get_most_frequent> (&res);
1289 *npeel = res.peel_info.npeel;
1290 *body_cost_vec = res.body_cost_vec;
1291 return res.peel_info.dr;
1295 /* Function vect_enhance_data_refs_alignment
1297 This pass will use loop versioning and loop peeling in order to enhance
1298 the alignment of data references in the loop.
1300 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1301 original loop is to be vectorized. Any other loops that are created by
1302 the transformations performed in this pass - are not supposed to be
1303 vectorized. This restriction will be relaxed.
1305 This pass will require a cost model to guide it whether to apply peeling
1306 or versioning or a combination of the two. For example, the scheme that
1307 Intel uses when given a loop with several memory accesses, is as follows:
1308 choose one memory access ('p') whose alignment you want to force by doing
1309 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1310 other accesses are not necessarily aligned, or (2) use loop versioning to
1311 generate one loop in which all accesses are aligned, and another loop in
1312 which only 'p' is necessarily aligned.
1314 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1315 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1316 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1318 Devising a cost model is the most critical aspect of this work. It will
1319 guide us on which access to peel for, whether to use loop versioning, how
1320 many versions to create, etc. The cost model will probably consist of
1321 generic considerations as well as target specific considerations (on
1322 powerpc for example, misaligned stores are more painful than misaligned
1323 loads).
1325 Here are the general steps involved in alignment enhancements:
1327 -- original loop, before alignment analysis:
1328 for (i=0; i<N; i++){
1329 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1330 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1333 -- After vect_compute_data_refs_alignment:
1334 for (i=0; i<N; i++){
1335 x = q[i]; # DR_MISALIGNMENT(q) = 3
1336 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1339 -- Possibility 1: we do loop versioning:
1340 if (p is aligned) {
1341 for (i=0; i<N; i++){ # loop 1A
1342 x = q[i]; # DR_MISALIGNMENT(q) = 3
1343 p[i] = y; # DR_MISALIGNMENT(p) = 0
1346 else {
1347 for (i=0; i<N; i++){ # loop 1B
1348 x = q[i]; # DR_MISALIGNMENT(q) = 3
1349 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1353 -- Possibility 2: we do loop peeling:
1354 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1355 x = q[i];
1356 p[i] = y;
1358 for (i = 3; i < N; i++){ # loop 2A
1359 x = q[i]; # DR_MISALIGNMENT(q) = 0
1360 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1363 -- Possibility 3: combination of loop peeling and versioning:
1364 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1365 x = q[i];
1366 p[i] = y;
1368 if (p is aligned) {
1369 for (i = 3; i<N; i++){ # loop 3A
1370 x = q[i]; # DR_MISALIGNMENT(q) = 0
1371 p[i] = y; # DR_MISALIGNMENT(p) = 0
1374 else {
1375 for (i = 3; i<N; i++){ # loop 3B
1376 x = q[i]; # DR_MISALIGNMENT(q) = 0
1377 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1381 These loops are later passed to loop_transform to be vectorized. The
1382 vectorizer will use the alignment information to guide the transformation
1383 (whether to generate regular loads/stores, or with special handling for
1384 misalignment). */
1386 bool
1387 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1389 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1390 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1391 enum dr_alignment_support supportable_dr_alignment;
1392 struct data_reference *dr0 = NULL, *first_store = NULL;
1393 struct data_reference *dr;
1394 unsigned int i, j;
1395 bool do_peeling = false;
1396 bool do_versioning = false;
1397 bool stat;
1398 gimple *stmt;
1399 stmt_vec_info stmt_info;
1400 unsigned int npeel = 0;
1401 bool all_misalignments_unknown = true;
1402 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1403 unsigned possible_npeel_number = 1;
1404 tree vectype;
1405 unsigned int nelements, mis, same_align_drs_max = 0;
1406 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1407 hash_table<peel_info_hasher> peeling_htab (1);
1409 if (dump_enabled_p ())
1410 dump_printf_loc (MSG_NOTE, vect_location,
1411 "=== vect_enhance_data_refs_alignment ===\n");
1413 /* Reset data so we can safely be called multiple times. */
1414 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1415 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1417 /* While cost model enhancements are expected in the future, the high level
1418 view of the code at this time is as follows:
1420 A) If there is a misaligned access then see if peeling to align
1421 this access can make all data references satisfy
1422 vect_supportable_dr_alignment. If so, update data structures
1423 as needed and return true.
1425 B) If peeling wasn't possible and there is a data reference with an
1426 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1427 then see if loop versioning checks can be used to make all data
1428 references satisfy vect_supportable_dr_alignment. If so, update
1429 data structures as needed and return true.
1431 C) If neither peeling nor versioning were successful then return false if
1432 any data reference does not satisfy vect_supportable_dr_alignment.
1434 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1436 Note, Possibility 3 above (which is peeling and versioning together) is not
1437 being done at this time. */
1439 /* (1) Peeling to force alignment. */
1441 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1442 Considerations:
1443 + How many accesses will become aligned due to the peeling
1444 - How many accesses will become unaligned due to the peeling,
1445 and the cost of misaligned accesses.
1446 - The cost of peeling (the extra runtime checks, the increase
1447 in code size). */
1449 FOR_EACH_VEC_ELT (datarefs, i, dr)
1451 stmt = DR_STMT (dr);
1452 stmt_info = vinfo_for_stmt (stmt);
1454 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1455 continue;
1457 /* For interleaving, only the alignment of the first access
1458 matters. */
1459 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1460 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1461 continue;
1463 /* For invariant accesses there is nothing to enhance. */
1464 if (integer_zerop (DR_STEP (dr)))
1465 continue;
1467 /* Strided accesses perform only component accesses, alignment is
1468 irrelevant for them. */
1469 if (STMT_VINFO_STRIDED_P (stmt_info)
1470 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1471 continue;
1473 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1474 do_peeling = vector_alignment_reachable_p (dr);
1475 if (do_peeling)
1477 if (known_alignment_for_access_p (dr))
1479 unsigned int npeel_tmp;
1480 bool negative = tree_int_cst_compare (DR_STEP (dr),
1481 size_zero_node) < 0;
1483 /* Save info about DR in the hash table. */
1484 vectype = STMT_VINFO_VECTYPE (stmt_info);
1485 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1486 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1487 TREE_TYPE (DR_REF (dr))));
1488 npeel_tmp = (negative
1489 ? (mis - nelements) : (nelements - mis))
1490 & (nelements - 1);
1492 /* For multiple types, it is possible that the bigger type access
1493 will have more than one peeling option. E.g., a loop with two
1494 types: one of size (vector size / 4), and the other one of
1495 size (vector size / 8).  The vectorization factor will be 8.  If both
1496 accesses are misaligned by 3, the first one needs one scalar
1497 iteration to be aligned, and the second one needs 5.  But the
1498 first one will also be aligned by peeling 5 scalar
1499 iterations, and in that case both accesses will be aligned.
1500 Hence, except for the immediate peeling amount, we also want
1501 to try adding a full vector size, as long as we don't exceed the
1502 vectorization factor.
1503 We do this automatically for the cost model, since we calculate the cost
1504 for every peeling option. */
1505 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1507 if (STMT_SLP_TYPE (stmt_info))
1508 possible_npeel_number
1509 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1510 else
1511 possible_npeel_number = vf / nelements;
1514 /* Handle the aligned case. We may decide to align some other
1515 access, making DR unaligned. */
1516 if (DR_MISALIGNMENT (dr) == 0)
1518 npeel_tmp = 0;
1519 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1520 possible_npeel_number++;
1523 for (j = 0; j < possible_npeel_number; j++)
1525 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1526 dr, npeel_tmp);
1527 npeel_tmp += nelements;
1530 all_misalignments_unknown = false;
1531 /* Data-ref that was chosen for the case that all the
1532 misalignments are unknown is not relevant anymore, since we
1533 have a data-ref with known alignment. */
1534 dr0 = NULL;
1536 else
1538 /* If we don't know any misalignment values, we prefer
1539 peeling for data-ref that has the maximum number of data-refs
1540 with the same alignment, unless the target prefers to align
1541 stores over loads. */
1542 if (all_misalignments_unknown)
1544 unsigned same_align_drs
1545 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1546 if (!dr0
1547 || same_align_drs_max < same_align_drs)
1549 same_align_drs_max = same_align_drs;
1550 dr0 = dr;
1552 /* For data-refs with the same number of related
1553 accesses prefer the one where the misalign
1554 computation will be invariant in the outermost loop. */
1555 else if (same_align_drs_max == same_align_drs)
1557 struct loop *ivloop0, *ivloop;
1558 ivloop0 = outermost_invariant_loop_for_expr
1559 (loop, DR_BASE_ADDRESS (dr0));
1560 ivloop = outermost_invariant_loop_for_expr
1561 (loop, DR_BASE_ADDRESS (dr));
1562 if ((ivloop && !ivloop0)
1563 || (ivloop && ivloop0
1564 && flow_loop_nested_p (ivloop, ivloop0)))
1565 dr0 = dr;
1568 if (!first_store && DR_IS_WRITE (dr))
1569 first_store = dr;
1572 /* If there are both known and unknown misaligned accesses in the
1573 loop, we choose peeling amount according to the known
1574 accesses. */
1575 if (!supportable_dr_alignment)
1577 dr0 = dr;
1578 if (!first_store && DR_IS_WRITE (dr))
1579 first_store = dr;
1583 else
1585 if (!aligned_access_p (dr))
1587 if (dump_enabled_p ())
1588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1589 "vector alignment may not be reachable\n");
1590 break;
1595 /* Check if we can possibly peel the loop. */
1596 if (!vect_can_advance_ivs_p (loop_vinfo)
1597 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1598 || loop->inner)
1599 do_peeling = false;
1601 if (do_peeling
1602 && all_misalignments_unknown
1603 && vect_supportable_dr_alignment (dr0, false))
1605 /* Check whether the target prefers stores over loads, i.e., whether
1606 misaligned stores are more expensive than misaligned loads (taking
1607 drs with same alignment into account). */
1608 if (first_store && DR_IS_READ (dr0))
1610 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1611 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1612 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1613 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1614 stmt_vector_for_cost dummy;
1615 dummy.create (2);
1617 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1618 &dummy);
1619 vect_get_data_access_cost (first_store, &store_inside_cost,
1620 &store_outside_cost, &dummy);
1622 dummy.release ();
1624 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1625 aligning the load DR0). */
1626 load_inside_penalty = store_inside_cost;
1627 load_outside_penalty = store_outside_cost;
1628 for (i = 0;
1629 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1630 DR_STMT (first_store))).iterate (i, &dr);
1631 i++)
1632 if (DR_IS_READ (dr))
1634 load_inside_penalty += load_inside_cost;
1635 load_outside_penalty += load_outside_cost;
1637 else
1639 load_inside_penalty += store_inside_cost;
1640 load_outside_penalty += store_outside_cost;
1643 /* Calculate the penalty for leaving DR0 unaligned (by
1644 aligning the FIRST_STORE). */
1645 store_inside_penalty = load_inside_cost;
1646 store_outside_penalty = load_outside_cost;
1647 for (i = 0;
1648 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1649 DR_STMT (dr0))).iterate (i, &dr);
1650 i++)
1651 if (DR_IS_READ (dr))
1653 store_inside_penalty += load_inside_cost;
1654 store_outside_penalty += load_outside_cost;
1656 else
1658 store_inside_penalty += store_inside_cost;
1659 store_outside_penalty += store_outside_cost;
1662 if (load_inside_penalty > store_inside_penalty
1663 || (load_inside_penalty == store_inside_penalty
1664 && load_outside_penalty > store_outside_penalty))
1665 dr0 = first_store;
1668 /* In case there are only loads with different unknown misalignments, use
1669 peeling only if it may help to align other accesses in the loop or
1670 if it may help improve load bandwidth when we'd end up using
1671 unaligned loads. */
1672 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1673 if (!first_store
1674 && !STMT_VINFO_SAME_ALIGN_REFS (
1675 vinfo_for_stmt (DR_STMT (dr0))).length ()
1676 && (vect_supportable_dr_alignment (dr0, false)
1677 != dr_unaligned_supported
1678 || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1679 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1680 do_peeling = false;
1683 if (do_peeling && !dr0)
1685 /* Peeling is possible, but there is no data access that is unsupported
1686 when unaligned.  So we try to choose the best possible peeling. */
1688 /* We should get here only if there are drs with known misalignment. */
1689 gcc_assert (!all_misalignments_unknown);
1691 /* Choose the best peeling from the hash table. */
1692 dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
1693 loop_vinfo, &npeel,
1694 &body_cost_vec);
1695 if (!dr0 || !npeel)
1696 do_peeling = false;
1699 if (do_peeling)
1701 stmt = DR_STMT (dr0);
1702 stmt_info = vinfo_for_stmt (stmt);
1703 vectype = STMT_VINFO_VECTYPE (stmt_info);
1704 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1706 if (known_alignment_for_access_p (dr0))
1708 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1709 size_zero_node) < 0;
1710 if (!npeel)
1712 /* Since it's known at compile time, compute the number of
1713 iterations in the peeled loop (the peeling factor) for use in
1714 updating DR_MISALIGNMENT values. The peeling factor is the
1715 vectorization factor minus the misalignment as an element
1716 count. */
1717 mis = DR_MISALIGNMENT (dr0);
1718 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1719 npeel = ((negative ? mis - nelements : nelements - mis)
1720 & (nelements - 1));
1723 /* For interleaved data access every iteration accesses all the
1724 members of the group, therefore we divide the number of iterations
1725 by the group size. */
1726 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1727 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1728 npeel /= GROUP_SIZE (stmt_info);
1730 if (dump_enabled_p ())
1731 dump_printf_loc (MSG_NOTE, vect_location,
1732 "Try peeling by %d\n", npeel);
1735 /* Ensure that all data refs can be vectorized after the peel. */
1736 FOR_EACH_VEC_ELT (datarefs, i, dr)
1738 int save_misalignment;
1740 if (dr == dr0)
1741 continue;
1743 stmt = DR_STMT (dr);
1744 stmt_info = vinfo_for_stmt (stmt);
1745 /* For interleaving, only the alignment of the first access
1746 matters. */
1747 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1748 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1749 continue;
1751 /* Strided accesses perform only component accesses, alignment is
1752 irrelevant for them. */
1753 if (STMT_VINFO_STRIDED_P (stmt_info)
1754 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1755 continue;
1757 save_misalignment = DR_MISALIGNMENT (dr);
1758 vect_update_misalignment_for_peel (dr, dr0, npeel);
1759 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1760 SET_DR_MISALIGNMENT (dr, save_misalignment);
1762 if (!supportable_dr_alignment)
1764 do_peeling = false;
1765 break;
1769 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1771 stat = vect_verify_datarefs_alignment (loop_vinfo);
1772 if (!stat)
1773 do_peeling = false;
1774 else
1776 body_cost_vec.release ();
1777 return stat;
1781 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1782 if (do_peeling)
1784 unsigned max_allowed_peel
1785 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1786 if (max_allowed_peel != (unsigned)-1)
1788 unsigned max_peel = npeel;
1789 if (max_peel == 0)
1791 gimple *dr_stmt = DR_STMT (dr0);
1792 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1793 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1794 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
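              /* If the exact peel count is not known, assume the worst case of
                 a full vector minus one element, e.g. up to 3 iterations for
                 V4SI.  */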
1796 if (max_peel > max_allowed_peel)
1798 do_peeling = false;
1799 if (dump_enabled_p ())
1800 dump_printf_loc (MSG_NOTE, vect_location,
1801 "Disable peeling, max peels reached: %d\n", max_peel);
1806 /* Cost model #2 - if peeling may result in a remaining loop not
1807 iterating enough to be vectorized then do not peel. */
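     E.g., with a vectorization factor of 4 and up to 3 peeled iterations,
     the loop must be known to run at least 4 + 3 = 7 times for the
     vectorized body to still execute; otherwise peeling is disabled.  */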
1808 if (do_peeling
1809 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1811 unsigned max_peel
1812 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1813 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1814 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1815 do_peeling = false;
1818 if (do_peeling)
1820 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1821 If the misalignment of DR_i is identical to that of dr0 then set
1822 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1823 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1824 by the peeling factor times the element size of DR_i (MOD the
1825 vectorization factor times the size). Otherwise, the
1826 misalignment of DR_i must be set to unknown. */
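         For instance, peeling NPEEL = 2 iterations over 4-byte elements adds
         2 * 4 = 8 bytes to each compile-time-known DR_MISALIGNMENT, taken
         modulo the vector size in bytes.  */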
1827 FOR_EACH_VEC_ELT (datarefs, i, dr)
1828 if (dr != dr0)
1830 /* Strided accesses perform only component accesses, alignment
1831 is irrelevant for them. */
1832 stmt_info = vinfo_for_stmt (DR_STMT (dr));
1833 if (STMT_VINFO_STRIDED_P (stmt_info)
1834 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1835 continue;
1837 vect_update_misalignment_for_peel (dr, dr0, npeel);
1840 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1841 if (npeel)
1842 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1843 else
1844 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1845 = DR_MISALIGNMENT (dr0);
1846 SET_DR_MISALIGNMENT (dr0, 0);
1847 if (dump_enabled_p ())
1849 dump_printf_loc (MSG_NOTE, vect_location,
1850 "Alignment of access forced using peeling.\n");
1851 dump_printf_loc (MSG_NOTE, vect_location,
1852 "Peeling for alignment will be applied.\n");
1854 /* The inside-loop cost will be accounted for in vectorizable_load
1855 and vectorizable_store correctly with adjusted alignments.
1856         Drop the body_cost_vec on the floor here.  */
1857 body_cost_vec.release ();
1859 stat = vect_verify_datarefs_alignment (loop_vinfo);
1860 gcc_assert (stat);
1861 return stat;
1865 body_cost_vec.release ();
1867 /* (2) Versioning to force alignment. */
1869 /* Try versioning if:
1870 1) optimize loop for speed
1871 2) there is at least one unsupported misaligned data ref with an unknown
1872 misalignment, and
1873 3) all misaligned data refs with a known misalignment are supported, and
1874 4) the number of runtime alignment checks is within reason. */
1876 do_versioning =
1877 optimize_loop_nest_for_speed_p (loop)
1878 && (!loop->inner); /* FORNOW */
1880 if (do_versioning)
1882 FOR_EACH_VEC_ELT (datarefs, i, dr)
1884 stmt = DR_STMT (dr);
1885 stmt_info = vinfo_for_stmt (stmt);
1887 /* For interleaving, only the alignment of the first access
1888 matters. */
1889 if (aligned_access_p (dr)
1890 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1891 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1892 continue;
1894 if (STMT_VINFO_STRIDED_P (stmt_info))
1896 /* Strided loads perform only component accesses, alignment is
1897 irrelevant for them. */
1898 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1899 continue;
1900 do_versioning = false;
1901 break;
1904 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1906 if (!supportable_dr_alignment)
1908 gimple *stmt;
1909 int mask;
1910 tree vectype;
1912 if (known_alignment_for_access_p (dr)
1913 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1914 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1916 do_versioning = false;
1917 break;
1920 stmt = DR_STMT (dr);
1921 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1922 gcc_assert (vectype);
1924 /* The rightmost bits of an aligned address must be zeros.
1925 Construct the mask needed for this test. For example,
1926 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1927 mask must be 15 = 0xf. */
1928 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1930 /* FORNOW: use the same mask to test all potentially unaligned
1931 references in the loop. The vectorizer currently supports
1932 a single vector size, see the reference to
1933 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1934 vectorization factor is computed. */
1935 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1936 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1937 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1938 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1939 DR_STMT (dr));
1943 /* Versioning requires at least one misaligned data reference. */
1944 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1945 do_versioning = false;
1946 else if (!do_versioning)
1947 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1950 if (do_versioning)
1952 vec<gimple *> may_misalign_stmts
1953 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1954 gimple *stmt;
1956 /* It can now be assumed that the data references in the statements
1957 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1958 of the loop being vectorized. */
1959 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1961 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1962 dr = STMT_VINFO_DATA_REF (stmt_info);
1963 SET_DR_MISALIGNMENT (dr, 0);
1964 if (dump_enabled_p ())
1965 dump_printf_loc (MSG_NOTE, vect_location,
1966 "Alignment of access forced using versioning.\n");
1969 if (dump_enabled_p ())
1970 dump_printf_loc (MSG_NOTE, vect_location,
1971 "Versioning for alignment will be applied.\n");
1973 /* Peeling and versioning can't be done together at this time. */
1974 gcc_assert (! (do_peeling && do_versioning));
1976 stat = vect_verify_datarefs_alignment (loop_vinfo);
1977 gcc_assert (stat);
1978 return stat;
1981 /* This point is reached if neither peeling nor versioning is being done. */
1982 gcc_assert (! (do_peeling || do_versioning));
1984 stat = vect_verify_datarefs_alignment (loop_vinfo);
1985 return stat;
1989 /* Function vect_find_same_alignment_drs.
1991 Update group and alignment relations according to the chosen
1992 vectorization factor. */
1994 static void
1995 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1996 loop_vec_info loop_vinfo)
1998 unsigned int i;
1999 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2000 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2001 struct data_reference *dra = DDR_A (ddr);
2002 struct data_reference *drb = DDR_B (ddr);
2003 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2004 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2005 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
2006 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
2007 lambda_vector dist_v;
2008 unsigned int loop_depth;
2010 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2011 return;
2013 if (dra == drb)
2014 return;
2016 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
2017 return;
2019 /* Loop-based vectorization and known data dependence. */
2020 if (DDR_NUM_DIST_VECTS (ddr) == 0)
2021 return;
2023 /* Data-dependence analysis reports a distance vector of zero
2024 for data-references that overlap only in the first iteration
2025     but have steps of opposite sign (see PR45764).
2026 So as a sanity check require equal DR_STEP. */
2027 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2028 return;
2030 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
2031 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
2033 int dist = dist_v[loop_depth];
2035 if (dump_enabled_p ())
2036 dump_printf_loc (MSG_NOTE, vect_location,
2037 "dependence distance = %d.\n", dist);
2039 /* Same loop iteration. */
2040 if (dist == 0
2041 || (dist % vectorization_factor == 0 && dra_size == drb_size))
2043      /* Two references whose distance is zero, or a multiple of VF with
          equal element sizes, have the same alignment.  */
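      /* E.g., with a vectorization factor of 4 and equal element sizes,
         accesses with dependence distance 4 are a whole vector apart and
         therefore share the same alignment.  */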
2044 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2045 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2046 if (dump_enabled_p ())
2048 dump_printf_loc (MSG_NOTE, vect_location,
2049 "accesses have the same alignment.\n");
2050 dump_printf (MSG_NOTE,
2051 "dependence distance modulo vf == 0 between ");
2052 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2053 dump_printf (MSG_NOTE, " and ");
2054 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2055 dump_printf (MSG_NOTE, "\n");
2062 /* Function vect_analyze_data_refs_alignment
2064 Analyze the alignment of the data-references in the loop.
2065 Return FALSE if a data reference is found that cannot be vectorized. */
2067 bool
2068 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2070 if (dump_enabled_p ())
2071 dump_printf_loc (MSG_NOTE, vect_location,
2072 "=== vect_analyze_data_refs_alignment ===\n");
2074 /* Mark groups of data references with same alignment using
2075 data dependence information. */
2076 vec<ddr_p> ddrs = vinfo->ddrs;
2077 struct data_dependence_relation *ddr;
2078 unsigned int i;
2080 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2081 vect_find_same_alignment_drs (ddr, vinfo);
2083 vec<data_reference_p> datarefs = vinfo->datarefs;
2084 struct data_reference *dr;
2086 FOR_EACH_VEC_ELT (datarefs, i, dr)
2088 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2089 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2090 && !vect_compute_data_ref_alignment (dr))
2092 /* Strided accesses perform only component accesses, misalignment
2093 information is irrelevant for them. */
2094 if (STMT_VINFO_STRIDED_P (stmt_info)
2095 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2096 continue;
2098 if (dump_enabled_p ())
2099 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2100 "not vectorized: can't calculate alignment "
2101 "for data ref.\n");
2103 return false;
2107 return true;
2111 /* Analyze alignment of DRs of stmts in NODE. */
2113 static bool
2114 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2116 /* We vectorize from the first scalar stmt in the node unless
2117 the node is permuted in which case we start from the first
2118 element in the group. */
2119 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2120 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2121 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2122 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2124 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2125 if (! vect_compute_data_ref_alignment (dr)
2126 /* For creating the data-ref pointer we need alignment of the
2127 first element anyway. */
2128 || (dr != first_dr
2129 && ! vect_compute_data_ref_alignment (first_dr))
2130 || ! verify_data_ref_alignment (dr))
2132 if (dump_enabled_p ())
2133 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2134 "not vectorized: bad data alignment in basic "
2135 "block.\n");
2136 return false;
2139 return true;
2142 /* Function vect_slp_analyze_and_verify_instance_alignment
2144 Analyze the alignment of the data-references in the SLP instance.
2145 Return FALSE if a data reference is found that cannot be vectorized. */
2147 bool
2148 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2150 if (dump_enabled_p ())
2151 dump_printf_loc (MSG_NOTE, vect_location,
2152 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2154 slp_tree node;
2155 unsigned i;
2156 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2157 if (! vect_slp_analyze_and_verify_node_alignment (node))
2158 return false;
2160 node = SLP_INSTANCE_TREE (instance);
2161 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2162 && ! vect_slp_analyze_and_verify_node_alignment
2163 (SLP_INSTANCE_TREE (instance)))
2164 return false;
2166 return true;
2170 /* Analyze groups of accesses: check that DR belongs to a group of
2171 accesses of legal size, step, etc. Detect gaps, single element
2172 interleaving, and other special cases. Set grouped access info.
2173 Collect groups of strided stores for further use in SLP analysis.
2174 Worker for vect_analyze_group_access. */
2176 static bool
2177 vect_analyze_group_access_1 (struct data_reference *dr)
2179 tree step = DR_STEP (dr);
2180 tree scalar_type = TREE_TYPE (DR_REF (dr));
2181 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2182 gimple *stmt = DR_STMT (dr);
2183 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2184 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2185 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2186 HOST_WIDE_INT dr_step = -1;
2187 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2188 bool slp_impossible = false;
2190 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2191 size of the interleaving group (including gaps). */
2192 if (tree_fits_shwi_p (step))
2194 dr_step = tree_to_shwi (step);
2195 /* Check that STEP is a multiple of type size. Otherwise there is
2196 a non-element-sized gap at the end of the group which we
2197 cannot represent in GROUP_GAP or GROUP_SIZE.
2198 ??? As we can handle non-constant step fine here we should
2199 simply remove uses of GROUP_GAP between the last and first
2200 element and instead rely on DR_STEP. GROUP_SIZE then would
2201 simply not include that gap. */
2202 if ((dr_step % type_size) != 0)
2204 if (dump_enabled_p ())
2206 dump_printf_loc (MSG_NOTE, vect_location,
2207 "Step ");
2208 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2209 dump_printf (MSG_NOTE,
2210 " is not a multiple of the element size for ");
2211 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2212 dump_printf (MSG_NOTE, "\n");
2214 return false;
2216 groupsize = absu_hwi (dr_step) / type_size;
2218 else
2219 groupsize = 0;
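  /* For the constant-step case above, e.g. 4-byte elements with a DR_STEP
     of 32 give a group size of 32 / 4 = 8 elements, counting any gaps.  */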
2221   /* A non-consecutive access is possible only if it is part of an interleaving group.  */
2222 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2224       /* Check if this DR is a part of interleaving, and is a single
2225 element of the group that is accessed in the loop. */
2227 /* Gaps are supported only for loads. STEP must be a multiple of the type
2228 size. The size of the group must be a power of 2. */
2229 if (DR_IS_READ (dr)
2230 && (dr_step % type_size) == 0
2231 && groupsize > 0
2232 && exact_log2 (groupsize) != -1)
2234 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2235 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2236 if (dump_enabled_p ())
2238 dump_printf_loc (MSG_NOTE, vect_location,
2239 "Detected single element interleaving ");
2240 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2241 dump_printf (MSG_NOTE, " step ");
2242 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2243 dump_printf (MSG_NOTE, "\n");
2246 return true;
2249 if (dump_enabled_p ())
2251 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2252 "not consecutive access ");
2253 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2256 if (bb_vinfo)
2258 /* Mark the statement as unvectorizable. */
2259 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2260 return true;
2263 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2264 STMT_VINFO_STRIDED_P (stmt_info) = true;
2265 return true;
2268 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2270 /* First stmt in the interleaving chain. Check the chain. */
2271 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2272 struct data_reference *data_ref = dr;
2273 unsigned int count = 1;
2274 tree prev_init = DR_INIT (data_ref);
2275 gimple *prev = stmt;
2276 HOST_WIDE_INT diff, gaps = 0;
2278 while (next)
2280 /* Skip same data-refs. In case that two or more stmts share
2281 data-ref (supported only for loads), we vectorize only the first
2282 stmt, and the rest get their vectorized loads from the first
2283 one. */
2284 if (!tree_int_cst_compare (DR_INIT (data_ref),
2285 DR_INIT (STMT_VINFO_DATA_REF (
2286 vinfo_for_stmt (next)))))
2288 if (DR_IS_WRITE (data_ref))
2290 if (dump_enabled_p ())
2291 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2292 "Two store stmts share the same dr.\n");
2293 return false;
2296 if (dump_enabled_p ())
2297 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2298 "Two or more load stmts share the same dr.\n");
2300 /* For load use the same data-ref load. */
2301 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2303 prev = next;
2304 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2305 continue;
2308 prev = next;
2309 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2311 /* All group members have the same STEP by construction. */
2312 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2314 /* Check that the distance between two accesses is equal to the type
2315 size. Otherwise, we have gaps. */
2316 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2317 - TREE_INT_CST_LOW (prev_init)) / type_size;
2318 if (diff != 1)
2320 /* FORNOW: SLP of accesses with gaps is not supported. */
2321 slp_impossible = true;
2322 if (DR_IS_WRITE (data_ref))
2324 if (dump_enabled_p ())
2325 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2326 "interleaved store with gaps\n");
2327 return false;
2330 gaps += diff - 1;
2333 last_accessed_element += diff;
2335 /* Store the gap from the previous member of the group. If there is no
2336 gap in the access, GROUP_GAP is always 1. */
2337 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2339 prev_init = DR_INIT (data_ref);
2340 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2341 /* Count the number of data-refs in the chain. */
2342 count++;
2345 if (groupsize == 0)
2346 groupsize = count + gaps;
2348 if (groupsize > UINT_MAX)
2350 if (dump_enabled_p ())
2351 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2352 "group is too large\n");
2353 return false;
2356 /* Check that the size of the interleaving is equal to count for stores,
2357 i.e., that there are no gaps. */
2358 if (groupsize != count
2359 && !DR_IS_READ (dr))
2361 if (dump_enabled_p ())
2362 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2363 "interleaved store with gaps\n");
2364 return false;
2367 /* If there is a gap after the last load in the group it is the
2368 difference between the groupsize and the last accessed
2369 element.
2370 When there is no gap, this difference should be 0. */
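     E.g., a group of size 8 whose last accessed element is 6 leaves a
     trailing gap of 8 - 6 = 2 elements.  */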
2371 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2373 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2374 if (dump_enabled_p ())
2376 dump_printf_loc (MSG_NOTE, vect_location,
2377 "Detected interleaving ");
2378 if (DR_IS_READ (dr))
2379 dump_printf (MSG_NOTE, "load ");
2380 else
2381 dump_printf (MSG_NOTE, "store ");
2382 dump_printf (MSG_NOTE, "of size %u starting with ",
2383 (unsigned)groupsize);
2384 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2385 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2386 dump_printf_loc (MSG_NOTE, vect_location,
2387 "There is a gap of %u elements after the group\n",
2388 GROUP_GAP (vinfo_for_stmt (stmt)));
2391 /* SLP: create an SLP data structure for every interleaving group of
2392     stores for further analysis in vect_analyze_slp.  */
2393 if (DR_IS_WRITE (dr) && !slp_impossible)
2395 if (loop_vinfo)
2396 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2397 if (bb_vinfo)
2398 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2402 return true;
2405 /* Analyze groups of accesses: check that DR belongs to a group of
2406 accesses of legal size, step, etc. Detect gaps, single element
2407 interleaving, and other special cases. Set grouped access info.
2408 Collect groups of strided stores for further use in SLP analysis. */
2410 static bool
2411 vect_analyze_group_access (struct data_reference *dr)
2413 if (!vect_analyze_group_access_1 (dr))
2415 /* Dissolve the group if present. */
2416 gimple *next;
2417 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2418 while (stmt)
2420 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2421 next = GROUP_NEXT_ELEMENT (vinfo);
2422 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2423 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2424 stmt = next;
2426 return false;
2428 return true;
2431 /* Analyze the access pattern of the data-reference DR.
2432 In case of non-consecutive accesses call vect_analyze_group_access() to
2433 analyze groups of accesses. */
2435 static bool
2436 vect_analyze_data_ref_access (struct data_reference *dr)
2438 tree step = DR_STEP (dr);
2439 tree scalar_type = TREE_TYPE (DR_REF (dr));
2440 gimple *stmt = DR_STMT (dr);
2441 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2442 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2443 struct loop *loop = NULL;
2445 if (loop_vinfo)
2446 loop = LOOP_VINFO_LOOP (loop_vinfo);
2448 if (loop_vinfo && !step)
2450 if (dump_enabled_p ())
2451 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2452 "bad data-ref access in loop\n");
2453 return false;
2456 /* Allow loads with zero step in inner-loop vectorization. */
2457 if (loop_vinfo && integer_zerop (step))
2459 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2460 if (!nested_in_vect_loop_p (loop, stmt))
2461 return DR_IS_READ (dr);
2462 /* Allow references with zero step for outer loops marked
2463 with pragma omp simd only - it guarantees absence of
2464 loop-carried dependencies between inner loop iterations. */
2465 if (!loop->force_vectorize)
2467 if (dump_enabled_p ())
2468 dump_printf_loc (MSG_NOTE, vect_location,
2469 "zero step in inner loop of nest\n");
2470 return false;
2474 if (loop && nested_in_vect_loop_p (loop, stmt))
2476 /* Interleaved accesses are not yet supported within outer-loop
2477 vectorization for references in the inner-loop. */
2478 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2480 /* For the rest of the analysis we use the outer-loop step. */
2481 step = STMT_VINFO_DR_STEP (stmt_info);
2482 if (integer_zerop (step))
2484 if (dump_enabled_p ())
2485 dump_printf_loc (MSG_NOTE, vect_location,
2486 "zero step in outer loop.\n");
2487 return DR_IS_READ (dr);
2491 /* Consecutive? */
2492 if (TREE_CODE (step) == INTEGER_CST)
2494 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2495 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2496 || (dr_step < 0
2497 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2499 /* Mark that it is not interleaving. */
2500 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2501 return true;
2505 if (loop && nested_in_vect_loop_p (loop, stmt))
2507 if (dump_enabled_p ())
2508 dump_printf_loc (MSG_NOTE, vect_location,
2509 "grouped access in outer loop.\n");
2510 return false;
2514   /* Assume this is a DR handled by the non-constant strided load case.  */
2515 if (TREE_CODE (step) != INTEGER_CST)
2516 return (STMT_VINFO_STRIDED_P (stmt_info)
2517 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2518 || vect_analyze_group_access (dr)));
2520 /* Not consecutive access - check if it's a part of interleaving group. */
2521 return vect_analyze_group_access (dr);
2526 /* A helper function used in the comparator function to sort data
2527 references. T1 and T2 are two data references to be compared.
2528 The function returns -1, 0, or 1. */
2530 static int
2531 compare_tree (tree t1, tree t2)
2533 int i, cmp;
2534 enum tree_code code;
2535 char tclass;
2537 if (t1 == t2)
2538 return 0;
2539 if (t1 == NULL)
2540 return -1;
2541 if (t2 == NULL)
2542 return 1;
2544 STRIP_NOPS (t1);
2545 STRIP_NOPS (t2);
2547 if (TREE_CODE (t1) != TREE_CODE (t2))
2548 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2550 code = TREE_CODE (t1);
2551 switch (code)
2553 /* For const values, we can just use hash values for comparisons. */
2554 case INTEGER_CST:
2555 case REAL_CST:
2556 case FIXED_CST:
2557 case STRING_CST:
2558 case COMPLEX_CST:
2559 case VECTOR_CST:
2561 hashval_t h1 = iterative_hash_expr (t1, 0);
2562 hashval_t h2 = iterative_hash_expr (t2, 0);
2563 if (h1 != h2)
2564 return h1 < h2 ? -1 : 1;
2565 break;
2568 case SSA_NAME:
2569 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2570 if (cmp != 0)
2571 return cmp;
2573 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2574 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2575 break;
2577 default:
2578 tclass = TREE_CODE_CLASS (code);
2580 /* For var-decl, we could compare their UIDs. */
2581 if (tclass == tcc_declaration)
2583 if (DECL_UID (t1) != DECL_UID (t2))
2584 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2585 break;
2588 /* For expressions with operands, compare their operands recursively. */
2589 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2591 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2592 if (cmp != 0)
2593 return cmp;
2597 return 0;
2601 /* Compare two data-references DRA and DRB to group them into chunks
2602 suitable for grouping. */
2604 static int
2605 dr_group_sort_cmp (const void *dra_, const void *drb_)
2607 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2608 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2609 int cmp;
2611 /* Stabilize sort. */
2612 if (dra == drb)
2613 return 0;
2615 /* DRs in different loops never belong to the same group. */
2616 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2617 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2618 if (loopa != loopb)
2619 return loopa->num < loopb->num ? -1 : 1;
2621 /* Ordering of DRs according to base. */
2622 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2624 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2625 if (cmp != 0)
2626 return cmp;
2629 /* And according to DR_OFFSET. */
2630 if (!dr_equal_offsets_p (dra, drb))
2632 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2633 if (cmp != 0)
2634 return cmp;
2637 /* Put reads before writes. */
2638 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2639 return DR_IS_READ (dra) ? -1 : 1;
2641 /* Then sort after access size. */
2642 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2643 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2645 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2646 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2647 if (cmp != 0)
2648 return cmp;
2651 /* And after step. */
2652 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2654 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2655 if (cmp != 0)
2656 return cmp;
2659 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2660 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2661 if (cmp == 0)
2662 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2663 return cmp;
2666 /* Function vect_analyze_data_ref_accesses.
2668 Analyze the access pattern of all the data references in the loop.
2670 FORNOW: the only access pattern that is considered vectorizable is a
2671 simple step 1 (consecutive) access.
2673 FORNOW: handle only arrays and pointer accesses. */
2675 bool
2676 vect_analyze_data_ref_accesses (vec_info *vinfo)
2678 unsigned int i;
2679 vec<data_reference_p> datarefs = vinfo->datarefs;
2680 struct data_reference *dr;
2682 if (dump_enabled_p ())
2683 dump_printf_loc (MSG_NOTE, vect_location,
2684 "=== vect_analyze_data_ref_accesses ===\n");
2686 if (datarefs.is_empty ())
2687 return true;
2689 /* Sort the array of datarefs to make building the interleaving chains
2690      linear.  Don't modify the original vector's order; it is needed for
2691 determining what dependencies are reversed. */
2692 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2693 datarefs_copy.qsort (dr_group_sort_cmp);
2695 /* Build the interleaving chains. */
2696 for (i = 0; i < datarefs_copy.length () - 1;)
2698 data_reference_p dra = datarefs_copy[i];
2699 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2700 stmt_vec_info lastinfo = NULL;
2701 for (i = i + 1; i < datarefs_copy.length (); ++i)
2703 data_reference_p drb = datarefs_copy[i];
2704 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2706       /* ??? Imperfect sorting (non-compatible types, non-modulo
2707          accesses, same accesses) can lead to a group being artificially
2708          split here as we don't just skip over those.  If it really
2709          matters we can push those to a worklist and re-iterate
2710          over them.  Then we can just skip ahead to the next DR here.  */
2712 /* DRs in a different loop should not be put into the same
2713 interleaving group. */
2714 if (gimple_bb (DR_STMT (dra))->loop_father
2715 != gimple_bb (DR_STMT (drb))->loop_father)
2716 break;
2718       /* Check that the data-refs have the same first location (except init)
2719 and they are both either store or load (not load and store,
2720 not masked loads or stores). */
2721 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2722 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2723 DR_BASE_ADDRESS (drb), 0)
2724 || !dr_equal_offsets_p (dra, drb)
2725 || !gimple_assign_single_p (DR_STMT (dra))
2726 || !gimple_assign_single_p (DR_STMT (drb)))
2727 break;
2729 /* Check that the data-refs have the same constant size. */
2730 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2731 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2732 if (!tree_fits_uhwi_p (sza)
2733 || !tree_fits_uhwi_p (szb)
2734 || !tree_int_cst_equal (sza, szb))
2735 break;
2737 /* Check that the data-refs have the same step. */
2738 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2739 break;
2741 /* Do not place the same access in the interleaving chain twice. */
2742 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2743 break;
2745 /* Check the types are compatible.
2746 ??? We don't distinguish this during sorting. */
2747 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2748 TREE_TYPE (DR_REF (drb))))
2749 break;
2751 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2752 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2753 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2754 gcc_assert (init_a < init_b);
2756 /* If init_b == init_a + the size of the type * k, we have an
2757 interleaving, and DRA is accessed before DRB. */
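          E.g., 4-byte loads with inits 0 and 8 satisfy (8 - 0) % 4 == 0 and
          may become members of one interleaving group, with a gap at offset 4.  */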
2758 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2759 if (type_size_a == 0
2760 || (init_b - init_a) % type_size_a != 0)
2761 break;
2763 /* If we have a store, the accesses are adjacent. This splits
2764 groups into chunks we support (we don't support vectorization
2765 of stores with gaps). */
2766 if (!DR_IS_READ (dra)
2767 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2768 (DR_INIT (datarefs_copy[i-1]))
2769 != type_size_a))
2770 break;
2772       /* If the step (when constant and nonzero) is not greater than the
2773          difference between the data-refs' inits, this splits groups into
2774          suitable sizes.  */
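      /* E.g., with DR_STEP 16 and inits 0 and 16 the second access already
         belongs to the next iteration of the first, so the chain is split
         here.  */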
2775 if (tree_fits_shwi_p (DR_STEP (dra)))
2777 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2778 if (step != 0 && step <= (init_b - init_a))
2779 break;
2782 if (dump_enabled_p ())
2784 dump_printf_loc (MSG_NOTE, vect_location,
2785 "Detected interleaving ");
2786 if (DR_IS_READ (dra))
2787 dump_printf (MSG_NOTE, "load ");
2788 else
2789 dump_printf (MSG_NOTE, "store ");
2790 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2791 dump_printf (MSG_NOTE, " and ");
2792 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2793 dump_printf (MSG_NOTE, "\n");
2796 /* Link the found element into the group list. */
2797 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2799 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2800 lastinfo = stmtinfo_a;
2802 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2803 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2804 lastinfo = stmtinfo_b;
2808 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2809 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2810 && !vect_analyze_data_ref_access (dr))
2812 if (dump_enabled_p ())
2813 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2814 "not vectorized: complicated access pattern.\n");
2816 if (is_a <bb_vec_info> (vinfo))
2818 /* Mark the statement as not vectorizable. */
2819 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2820 continue;
2822 else
2824 datarefs_copy.release ();
2825 return false;
2829 datarefs_copy.release ();
2830 return true;
2834 /* Operator == between two dr_with_seg_len objects.
2836 This equality operator is used to make sure two data refs
2837    are the same one so that we will consider combining the
2838    aliasing checks of those two pairs of data-dependent data
2839    refs.  */
2841 static bool
2842 operator == (const dr_with_seg_len& d1,
2843 const dr_with_seg_len& d2)
2845 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2846 DR_BASE_ADDRESS (d2.dr), 0)
2847 && compare_tree (d1.offset, d2.offset) == 0
2848 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2851 /* Function comp_dr_with_seg_len_pair.
2853 Comparison function for sorting objects of dr_with_seg_len_pair_t
2854 so that we can combine aliasing checks in one scan. */
2856 static int
2857 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2859 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2860 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2862 const dr_with_seg_len &p11 = p1->first,
2863 &p12 = p1->second,
2864 &p21 = p2->first,
2865 &p22 = p2->second;
2867   /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2868      if a and c have the same base address and step, and b and d have the same
2869      address and step.  Therefore, if either a&c or b&d don't have the same address
2870      and step, we don't care about the order of those two pairs after sorting.  */
2871 int comp_res;
2873 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2874 DR_BASE_ADDRESS (p21.dr))) != 0)
2875 return comp_res;
2876 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2877 DR_BASE_ADDRESS (p22.dr))) != 0)
2878 return comp_res;
2879 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2880 return comp_res;
2881 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2882 return comp_res;
2883 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2884 return comp_res;
2885 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2886 return comp_res;
2888 return 0;
2891 /* Function vect_vfa_segment_size.
2893 Create an expression that computes the size of segment
2894    that will be accessed for a data reference.  The function takes into
2895 account that realignment loads may access one more vector.
2897 Input:
2898 DR: The data reference.
2899 LENGTH_FACTOR: segment length to consider.
2901 Return an expression whose value is the size of segment which will be
2902 accessed by DR. */
2904 static tree
2905 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2907 tree segment_length;
2909 if (integer_zerop (DR_STEP (dr)))
2910 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2911 else
2912 segment_length = size_binop (MULT_EXPR,
2913 fold_convert (sizetype, DR_STEP (dr)),
2914 fold_convert (sizetype, length_factor));
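  /* E.g., with a 4-byte DR_STEP and LENGTH_FACTOR equal to a vectorization
     factor of 4, the segment covers 4 * 4 = 16 bytes; one extra vector is
     added below for optimized realignment loads.  */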
2916 if (vect_supportable_dr_alignment (dr, false)
2917 == dr_explicit_realign_optimized)
2919 tree vector_size = TYPE_SIZE_UNIT
2920 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2922 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2924 return segment_length;
2927 /* Function vect_prune_runtime_alias_test_list.
2929 Prune a list of ddrs to be tested at run-time by versioning for alias.
2930 Merge several alias checks into one if possible.
2931    Return FALSE if the resulting list of ddrs is longer than allowed by
2932 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2934 bool
2935 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2937 vec<ddr_p> may_alias_ddrs =
2938 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2939 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2940 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2941 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2942 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2944 ddr_p ddr;
2945 unsigned int i;
2946 tree length_factor;
2948 if (dump_enabled_p ())
2949 dump_printf_loc (MSG_NOTE, vect_location,
2950 "=== vect_prune_runtime_alias_test_list ===\n");
2952 if (may_alias_ddrs.is_empty ())
2953 return true;
2955 /* Basically, for each pair of dependent data refs store_ptr_0
2956 and load_ptr_0, we create an expression:
2958 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2959 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2961 for aliasing checks. However, in some cases we can decrease
2962 the number of checks by combining two checks into one. For
2963 example, suppose we have another pair of data refs store_ptr_0
2964 and load_ptr_1, and if the following condition is satisfied:
2966 load_ptr_0 < load_ptr_1 &&
2967 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2969      (this condition means that, in each iteration of the vectorized loop,
2970 the accessed memory of store_ptr_0 cannot be between the memory
2971 of load_ptr_0 and load_ptr_1.)
2973      we can then use only the following expression to finish the
2974      aliasing checks between store_ptr_0 & load_ptr_0 and
2975 store_ptr_0 & load_ptr_1:
2977 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2978 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2980 Note that we only consider that load_ptr_0 and load_ptr_1 have the
2981      same base address.  */
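  /* For instance, two checks whose load segments start at load_ptr_0 and
     load_ptr_0 + 8 and are each 16 bytes long can typically be merged below
     into a single check over an 8 + 16 = 24 byte segment.  */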
2983 comp_alias_ddrs.create (may_alias_ddrs.length ());
2985 /* First, we collect all data ref pairs for aliasing checks. */
2986 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2988 struct data_reference *dr_a, *dr_b;
2989 gimple *dr_group_first_a, *dr_group_first_b;
2990 tree segment_length_a, segment_length_b;
2991 gimple *stmt_a, *stmt_b;
2993 dr_a = DDR_A (ddr);
2994 stmt_a = DR_STMT (DDR_A (ddr));
2995 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2996 if (dr_group_first_a)
2998 stmt_a = dr_group_first_a;
2999 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3002 dr_b = DDR_B (ddr);
3003 stmt_b = DR_STMT (DDR_B (ddr));
3004 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3005 if (dr_group_first_b)
3007 stmt_b = dr_group_first_b;
3008 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3011 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3012 length_factor = scalar_loop_iters;
3013 else
3014 length_factor = size_int (vect_factor);
3015 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3016 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3018 dr_with_seg_len_pair_t dr_with_seg_len_pair
3019 (dr_with_seg_len (dr_a, segment_length_a),
3020 dr_with_seg_len (dr_b, segment_length_b));
3022 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
3023 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3025 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3028 /* Second, we sort the collected data ref pairs so that we can scan
3029 them once to combine all possible aliasing checks. */
3030 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
3032 /* Third, we scan the sorted dr pairs and check if we can combine
3033 alias checks of two neighboring dr pairs. */
3034 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
3036 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
3037 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
3038 *dr_b1 = &comp_alias_ddrs[i-1].second,
3039 *dr_a2 = &comp_alias_ddrs[i].first,
3040 *dr_b2 = &comp_alias_ddrs[i].second;
3042 /* Remove duplicate data ref pairs. */
3043 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
3045 if (dump_enabled_p ())
3047 dump_printf_loc (MSG_NOTE, vect_location,
3048 "found equal ranges ");
3049 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3050 DR_REF (dr_a1->dr));
3051 dump_printf (MSG_NOTE, ", ");
3052 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3053 DR_REF (dr_b1->dr));
3054 dump_printf (MSG_NOTE, " and ");
3055 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3056 DR_REF (dr_a2->dr));
3057 dump_printf (MSG_NOTE, ", ");
3058 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3059 DR_REF (dr_b2->dr));
3060 dump_printf (MSG_NOTE, "\n");
3063 comp_alias_ddrs.ordered_remove (i--);
3064 continue;
3067 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
3069      /* We consider the case that DR_B1 and DR_B2 are the same memrefs,
3070 and DR_A1 and DR_A2 are two consecutive memrefs. */
3071 if (*dr_a1 == *dr_a2)
3073 std::swap (dr_a1, dr_b1);
3074 std::swap (dr_a2, dr_b2);
3077 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
3078 DR_BASE_ADDRESS (dr_a2->dr),
3080 || !tree_fits_shwi_p (dr_a1->offset)
3081 || !tree_fits_shwi_p (dr_a2->offset))
3082 continue;
3084 /* Make sure dr_a1 starts left of dr_a2. */
3085 if (tree_int_cst_lt (dr_a2->offset, dr_a1->offset))
3086 std::swap (*dr_a1, *dr_a2);
3088 unsigned HOST_WIDE_INT diff
3089 = tree_to_shwi (dr_a2->offset) - tree_to_shwi (dr_a1->offset);
3092 bool do_remove = false;
3094 /* If the left segment does not extend beyond the start of the
3095 right segment the new segment length is that of the right
3096 plus the segment distance. */
3097 if (tree_fits_uhwi_p (dr_a1->seg_len)
3098 && compare_tree_int (dr_a1->seg_len, diff) <= 0)
3100 dr_a1->seg_len = size_binop (PLUS_EXPR, dr_a2->seg_len,
3101 size_int (diff));
3102 do_remove = true;
3104 /* Generally the new segment length is the maximum of the
3105 left segment size and the right segment size plus the distance.
3106 ??? We can also build tree MAX_EXPR here but it's not clear this
3107 is profitable. */
3108 else if (tree_fits_uhwi_p (dr_a1->seg_len)
3109 && tree_fits_uhwi_p (dr_a2->seg_len))
3111 unsigned HOST_WIDE_INT seg_len_a1 = tree_to_uhwi (dr_a1->seg_len);
3112 unsigned HOST_WIDE_INT seg_len_a2 = tree_to_uhwi (dr_a2->seg_len);
3113 dr_a1->seg_len = size_int (MAX (seg_len_a1, diff + seg_len_a2));
3114 do_remove = true;
3116 /* Now we check if the following condition is satisfied:
3118 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
3120 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
3121 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
3122 have to make a best estimation. We can get the minimum value
3123 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
3124 then either of the following two conditions can guarantee the
3125 one above:
3127 1: DIFF <= MIN_SEG_LEN_B
3128 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B */
3129 else
3131 unsigned HOST_WIDE_INT min_seg_len_b
3132 = (tree_fits_uhwi_p (dr_b1->seg_len)
3133 ? tree_to_uhwi (dr_b1->seg_len)
3134 : vect_factor);
3136 if (diff <= min_seg_len_b
3137 || (tree_fits_uhwi_p (dr_a1->seg_len)
3138 && diff - tree_to_uhwi (dr_a1->seg_len) < min_seg_len_b))
3140 dr_a1->seg_len = size_binop (PLUS_EXPR,
3141 dr_a2->seg_len, size_int (diff));
3142 do_remove = true;
3146 if (do_remove)
3148 if (dump_enabled_p ())
3150 dump_printf_loc (MSG_NOTE, vect_location,
3151 "merging ranges for ");
3152 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1->dr));
3153 dump_printf (MSG_NOTE, ", ");
3154 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1->dr));
3155 dump_printf (MSG_NOTE, " and ");
3156 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2->dr));
3157 dump_printf (MSG_NOTE, ", ");
3158 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2->dr));
3159 dump_printf (MSG_NOTE, "\n");
3161 comp_alias_ddrs.ordered_remove (i--);
3166 dump_printf_loc (MSG_NOTE, vect_location,
3167 "improved number of alias checks from %d to %d\n",
3168 may_alias_ddrs.length (), comp_alias_ddrs.length ());
3169 if ((int) comp_alias_ddrs.length () >
3170 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3171 return false;
3173 return true;
3176 /* Check whether a non-affine read or write in stmt is suitable for gather load
3177 or scatter store and if so, return a builtin decl for that operation. */
3179 tree
3180 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo, tree *basep,
3181 tree *offp, int *scalep)
3183 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3184 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3185 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3186 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3187 tree offtype = NULL_TREE;
3188 tree decl, base, off;
3189 machine_mode pmode;
3190 int punsignedp, reversep, pvolatilep = 0;
3192 base = DR_REF (dr);
3193 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3194 see if we can use the def stmt of the address. */
3195 if (is_gimple_call (stmt)
3196 && gimple_call_internal_p (stmt)
3197 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3198 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3199 && TREE_CODE (base) == MEM_REF
3200 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3201 && integer_zerop (TREE_OPERAND (base, 1))
3202 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3204 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3205 if (is_gimple_assign (def_stmt)
3206 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3207 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3210 /* The gather and scatter builtins need address of the form
3211 loop_invariant + vector * {1, 2, 4, 8}
3213      or loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3214 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3215 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3216 multiplications and additions in it. To get a vector, we need
3217 a single SSA_NAME that will be defined in the loop and will
3218 contain everything that is not loop invariant and that can be
3219      vectorized.  The following code attempts to find such a preexisting
3220 SSA_NAME OFF and put the loop invariants into a tree BASE
3221 that can be gimplified before the loop. */
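  /* As an illustrative (assumed) example: for a gather-like read x[c[i]]
     with 4-byte elements, the walk below would typically end with OFF being
     the SSA_NAME holding c[i], SCALE being 4, and the loop-invariant address
     of x folded into BASE.  */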
3222 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3223 &punsignedp, &reversep, &pvolatilep, false);
3224 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3226 if (TREE_CODE (base) == MEM_REF)
3228 if (!integer_zerop (TREE_OPERAND (base, 1)))
3230 if (off == NULL_TREE)
3232 offset_int moff = mem_ref_offset (base);
3233 off = wide_int_to_tree (sizetype, moff);
3235 else
3236 off = size_binop (PLUS_EXPR, off,
3237 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3239 base = TREE_OPERAND (base, 0);
3241 else
3242 base = build_fold_addr_expr (base);
3244 if (off == NULL_TREE)
3245 off = size_zero_node;
3247   /* If base is not loop invariant, then if off is 0 we start with just
3248      the constant offset in the loop-invariant BASE and continue with base
3249      as OFF; otherwise give up.
3250      We could handle that case by gimplifying the addition of base + off
3251      into some SSA_NAME and using that as off, but for now punt.  */
3252 if (!expr_invariant_in_loop_p (loop, base))
3254 if (!integer_zerop (off))
3255 return NULL_TREE;
3256 off = base;
3257 base = size_int (pbitpos / BITS_PER_UNIT);
3259 /* Otherwise put base + constant offset into the loop invariant BASE
3260 and continue with OFF. */
3261 else
3263 base = fold_convert (sizetype, base);
3264 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3267 /* OFF at this point may be either a SSA_NAME or some tree expression
3268 from get_inner_reference. Try to peel off loop invariants from it
3269 into BASE as long as possible. */
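  /* E.g., if OFF is n + c[i] with loop-invariant n, then n (scaled by SCALE)
     is folded into BASE and the walk continues with c[i].  */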
3270 STRIP_NOPS (off);
3271 while (offtype == NULL_TREE)
3273 enum tree_code code;
3274 tree op0, op1, add = NULL_TREE;
3276 if (TREE_CODE (off) == SSA_NAME)
3278 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3280 if (expr_invariant_in_loop_p (loop, off))
3281 return NULL_TREE;
3283 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3284 break;
3286 op0 = gimple_assign_rhs1 (def_stmt);
3287 code = gimple_assign_rhs_code (def_stmt);
3288 op1 = gimple_assign_rhs2 (def_stmt);
3290 else
3292 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3293 return NULL_TREE;
3294 code = TREE_CODE (off);
3295 extract_ops_from_tree (off, &code, &op0, &op1);
3297 switch (code)
3299 case POINTER_PLUS_EXPR:
3300 case PLUS_EXPR:
3301 if (expr_invariant_in_loop_p (loop, op0))
3303 add = op0;
3304 off = op1;
3305 do_add:
3306 add = fold_convert (sizetype, add);
3307 if (scale != 1)
3308 add = size_binop (MULT_EXPR, add, size_int (scale));
3309 base = size_binop (PLUS_EXPR, base, add);
3310 continue;
3312 if (expr_invariant_in_loop_p (loop, op1))
3314 add = op1;
3315 off = op0;
3316 goto do_add;
3318 break;
3319 case MINUS_EXPR:
3320 if (expr_invariant_in_loop_p (loop, op1))
3322 add = fold_convert (sizetype, op1);
3323 add = size_binop (MINUS_EXPR, size_zero_node, add);
3324 off = op0;
3325 goto do_add;
3327 break;
3328 case MULT_EXPR:
3329 if (scale == 1 && tree_fits_shwi_p (op1))
3331 scale = tree_to_shwi (op1);
3332 off = op0;
3333 continue;
3335 break;
3336 case SSA_NAME:
3337 off = op0;
3338 continue;
3339 CASE_CONVERT:
3340 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3341 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3342 break;
3343 if (TYPE_PRECISION (TREE_TYPE (op0))
3344 == TYPE_PRECISION (TREE_TYPE (off)))
3346 off = op0;
3347 continue;
3349 if (TYPE_PRECISION (TREE_TYPE (op0))
3350 < TYPE_PRECISION (TREE_TYPE (off)))
3352 off = op0;
3353 offtype = TREE_TYPE (off);
3354 STRIP_NOPS (off);
3355 continue;
3357 break;
3358 default:
3359 break;
3361 break;
3364 /* If at the end OFF still isn't a SSA_NAME or isn't
3365 defined in the loop, punt. */
3366 if (TREE_CODE (off) != SSA_NAME
3367 || expr_invariant_in_loop_p (loop, off))
3368 return NULL_TREE;
3370 if (offtype == NULL_TREE)
3371 offtype = TREE_TYPE (off);
3373 if (DR_IS_READ (dr))
3374 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3375 offtype, scale);
3376 else
3377 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3378 offtype, scale);
3380 if (decl == NULL_TREE)
3381 return NULL_TREE;
3383 if (basep)
3384 *basep = base;
3385 if (offp)
3386 *offp = off;
3387 if (scalep)
3388 *scalep = scale;
3389 return decl;
3392 /* Function vect_analyze_data_refs.
3394 Find all the data references in the loop or basic block.
3396 The general structure of the analysis of data refs in the vectorizer is as
3397 follows:
3398 1- vect_analyze_data_refs(loop/bb): call
3399 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3400 in the loop/bb and their dependences.
3401 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3402 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3403 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3407 bool
3408 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3410 struct loop *loop = NULL;
3411 unsigned int i;
3412 struct data_reference *dr;
3413 tree scalar_type;
3415 if (dump_enabled_p ())
3416 dump_printf_loc (MSG_NOTE, vect_location,
3417 "=== vect_analyze_data_refs ===\n");
3419 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3420 loop = LOOP_VINFO_LOOP (loop_vinfo);
3422 /* Go through the data-refs, check that the analysis succeeded. Update
3423 pointer from stmt_vec_info struct to DR and vectype. */
3425 vec<data_reference_p> datarefs = vinfo->datarefs;
3426 FOR_EACH_VEC_ELT (datarefs, i, dr)
3428 gimple *stmt;
3429 stmt_vec_info stmt_info;
3430 tree base, offset, init;
3431 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3432 bool simd_lane_access = false;
3433 int vf;
3435 again:
3436 if (!dr || !DR_REF (dr))
3438 if (dump_enabled_p ())
3439 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3440 "not vectorized: unhandled data-ref\n");
3441 return false;
3444 stmt = DR_STMT (dr);
3445 stmt_info = vinfo_for_stmt (stmt);
3447 /* Discard clobbers from the dataref vector. We will remove
3448 clobber stmts during vectorization. */
3449 if (gimple_clobber_p (stmt))
3451 free_data_ref (dr);
3452 if (i == datarefs.length () - 1)
3454 datarefs.pop ();
3455 break;
3457 datarefs.ordered_remove (i);
3458 dr = datarefs[i];
3459 goto again;
3462 /* Check that analysis of the data-ref succeeded. */
3463 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3464 || !DR_STEP (dr))
3466 bool maybe_gather
3467 = DR_IS_READ (dr)
3468 && !TREE_THIS_VOLATILE (DR_REF (dr))
3469 && targetm.vectorize.builtin_gather != NULL;
3470 bool maybe_scatter
3471 = DR_IS_WRITE (dr)
3472 && !TREE_THIS_VOLATILE (DR_REF (dr))
3473 && targetm.vectorize.builtin_scatter != NULL;
3474 bool maybe_simd_lane_access
3475 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3477      /* If the target supports vector gather loads or scatter stores, or if
3478         this might be a SIMD lane access, see whether they can be used.  */
3479 if (is_a <loop_vec_info> (vinfo)
3480 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3481 && !nested_in_vect_loop_p (loop, stmt))
3483 struct data_reference *newdr
3484 = create_data_ref (NULL, loop_containing_stmt (stmt),
3485 DR_REF (dr), stmt, maybe_scatter ? false : true);
3486 gcc_assert (newdr != NULL && DR_REF (newdr));
3487 if (DR_BASE_ADDRESS (newdr)
3488 && DR_OFFSET (newdr)
3489 && DR_INIT (newdr)
3490 && DR_STEP (newdr)
3491 && integer_zerop (DR_STEP (newdr)))
3493 if (maybe_simd_lane_access)
3495 tree off = DR_OFFSET (newdr);
3496 STRIP_NOPS (off);
3497 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3498 && TREE_CODE (off) == MULT_EXPR
3499 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3501 tree step = TREE_OPERAND (off, 1);
3502 off = TREE_OPERAND (off, 0);
3503 STRIP_NOPS (off);
3504 if (CONVERT_EXPR_P (off)
3505 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3506 0)))
3507 < TYPE_PRECISION (TREE_TYPE (off)))
3508 off = TREE_OPERAND (off, 0);
3509 if (TREE_CODE (off) == SSA_NAME)
3511 gimple *def = SSA_NAME_DEF_STMT (off);
3512 tree reft = TREE_TYPE (DR_REF (newdr));
3513 if (is_gimple_call (def)
3514 && gimple_call_internal_p (def)
3515 && (gimple_call_internal_fn (def)
3516 == IFN_GOMP_SIMD_LANE))
3518 tree arg = gimple_call_arg (def, 0);
3519 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3520 arg = SSA_NAME_VAR (arg);
3521 if (arg == loop->simduid
3522 /* For now. */
3523 && tree_int_cst_equal
3524 (TYPE_SIZE_UNIT (reft),
3525 step))
3527 DR_OFFSET (newdr) = ssize_int (0);
3528 DR_STEP (newdr) = step;
3529 DR_ALIGNED_TO (newdr)
3530 = size_int (BIGGEST_ALIGNMENT);
3531 dr = newdr;
3532 simd_lane_access = true;
3538 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3540 dr = newdr;
3541 if (maybe_gather)
3542 gatherscatter = GATHER;
3543 else
3544 gatherscatter = SCATTER;
3547 if (gatherscatter == SG_NONE && !simd_lane_access)
3548 free_data_ref (newdr);
3551 if (gatherscatter == SG_NONE && !simd_lane_access)
3553 if (dump_enabled_p ())
3555 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3556 "not vectorized: data ref analysis "
3557 "failed ");
3558 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3559 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3562 if (is_a <bb_vec_info> (vinfo))
3563 break;
3565 return false;
3569 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3571 if (dump_enabled_p ())
3572 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3573 "not vectorized: base addr of dr is a "
3574 "constant\n");
3576 if (is_a <bb_vec_info> (vinfo))
3577 break;
3579 if (gatherscatter != SG_NONE || simd_lane_access)
3580 free_data_ref (dr);
3581 return false;
3584 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3586 if (dump_enabled_p ())
3588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3589 "not vectorized: volatile type ");
3590 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3591 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3594 if (is_a <bb_vec_info> (vinfo))
3595 break;
3597 return false;
3600 if (stmt_can_throw_internal (stmt))
3602 if (dump_enabled_p ())
3604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3605 "not vectorized: statement can throw an "
3606 "exception ");
3607 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3608 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3611 if (is_a <bb_vec_info> (vinfo))
3612 break;
3614 if (gatherscatter != SG_NONE || simd_lane_access)
3615 free_data_ref (dr);
3616 return false;
3619 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3620 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3622 if (dump_enabled_p ())
3624 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3625 "not vectorized: statement is bitfield "
3626 "access ");
3627 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3628 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3631 if (is_a <bb_vec_info> (vinfo))
3632 break;
3634 if (gatherscatter != SG_NONE || simd_lane_access)
3635 free_data_ref (dr);
3636 return false;
3639 base = unshare_expr (DR_BASE_ADDRESS (dr));
3640 offset = unshare_expr (DR_OFFSET (dr));
3641 init = unshare_expr (DR_INIT (dr));
3643 if (is_gimple_call (stmt)
3644 && (!gimple_call_internal_p (stmt)
3645 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3646 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3648 if (dump_enabled_p ())
3650 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3651 "not vectorized: dr in a call ");
3652 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3653 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3656 if (is_a <bb_vec_info> (vinfo))
3657 break;
3659 if (gatherscatter != SG_NONE || simd_lane_access)
3660 free_data_ref (dr);
3661 return false;
3664 /* Update DR field in stmt_vec_info struct. */
3666 /* If the dataref is in an inner-loop of the loop that is considered for
3667 vectorization, we also want to analyze the access relative to
3668 the outer-loop (DR contains information only relative to the
3669 inner-most enclosing loop). We do that by building a reference to the
3670 first location accessed by the inner-loop, and analyze it relative to
3671 the outer-loop. */
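	  /* For example (illustrative only), when vectorizing the i-loop of

	       for (i = 0; i < N; i++)
		 for (j = 0; j < M; j++)
		   ... = a[i][j];

	     DR describes a[i][j] relative to the j-loop only; the code below
	     instead analyzes the evolution of &a[i][0] relative to the
	     i-loop.  */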
3672 if (loop && nested_in_vect_loop_p (loop, stmt))
3674 tree outer_step, outer_base, outer_init;
3675 HOST_WIDE_INT pbitsize, pbitpos;
3676 tree poffset;
3677 machine_mode pmode;
3678 int punsignedp, preversep, pvolatilep;
3679 affine_iv base_iv, offset_iv;
3680 tree dinit;
3682 /* Build a reference to the first location accessed by the
3683 inner-loop: *(BASE+INIT). (The first location is actually
3684 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3685 tree inner_base = build_fold_indirect_ref
3686 (fold_build_pointer_plus (base, init));
3688 if (dump_enabled_p ())
3690 dump_printf_loc (MSG_NOTE, vect_location,
3691 "analyze in outer-loop: ");
3692 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3693 dump_printf (MSG_NOTE, "\n");
3696 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3697 &poffset, &pmode, &punsignedp,
3698 &preversep, &pvolatilep, false);
3699 gcc_assert (outer_base != NULL_TREE);
3701 if (pbitpos % BITS_PER_UNIT != 0)
3703 if (dump_enabled_p ())
3704 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3705 "failed: bit offset alignment.\n");
3706 return false;
3709 if (preversep)
3711 if (dump_enabled_p ())
3712 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3713 "failed: reverse storage order.\n");
3714 return false;
3717 outer_base = build_fold_addr_expr (outer_base);
3718 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3719 &base_iv, false))
3721 if (dump_enabled_p ())
3722 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3723 "failed: evolution of base is not affine.\n");
3724 return false;
3727 if (offset)
3729 if (poffset)
3730 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3731 poffset);
3732 else
3733 poffset = offset;
3736 if (!poffset)
3738 offset_iv.base = ssize_int (0);
3739 offset_iv.step = ssize_int (0);
3741 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3742 &offset_iv, false))
3744 if (dump_enabled_p ())
3745 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3746 "evolution of offset is not affine.\n");
3747 return false;
3750 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3751 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3752 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3753 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3754 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3756 outer_step = size_binop (PLUS_EXPR,
3757 fold_convert (ssizetype, base_iv.step),
3758 fold_convert (ssizetype, offset_iv.step));
3760 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3761 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3762 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3763 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3764 STMT_VINFO_DR_OFFSET (stmt_info) =
3765 fold_convert (ssizetype, offset_iv.base);
3766 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3767 size_int (highest_pow2_factor (offset_iv.base));
3769 if (dump_enabled_p ())
3771 dump_printf_loc (MSG_NOTE, vect_location,
3772 "\touter base_address: ");
3773 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3774 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3775 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3776 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3777 STMT_VINFO_DR_OFFSET (stmt_info));
3778 dump_printf (MSG_NOTE,
3779 "\n\touter constant offset from base address: ");
3780 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3781 STMT_VINFO_DR_INIT (stmt_info));
3782 dump_printf (MSG_NOTE, "\n\touter step: ");
3783 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3784 STMT_VINFO_DR_STEP (stmt_info));
3785 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3786 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3787 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3788 dump_printf (MSG_NOTE, "\n");
3792 if (STMT_VINFO_DATA_REF (stmt_info))
3794 if (dump_enabled_p ())
3796 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3797 "not vectorized: more than one data ref "
3798 "in stmt: ");
3799 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3800 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3803 if (is_a <bb_vec_info> (vinfo))
3804 break;
3806 if (gatherscatter != SG_NONE || simd_lane_access)
3807 free_data_ref (dr);
3808 return false;
3811 STMT_VINFO_DATA_REF (stmt_info) = dr;
3812 if (simd_lane_access)
3814 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3815 free_data_ref (datarefs[i]);
3816 datarefs[i] = dr;
3819 /* Set vectype for STMT. */
3820 scalar_type = TREE_TYPE (DR_REF (dr));
3821 STMT_VINFO_VECTYPE (stmt_info)
3822 = get_vectype_for_scalar_type (scalar_type);
3823 if (!STMT_VINFO_VECTYPE (stmt_info))
3825 if (dump_enabled_p ())
3827 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3828 "not vectorized: no vectype for stmt: ");
3829 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3830 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3831 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3832 scalar_type);
3833 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3836 if (is_a <bb_vec_info> (vinfo))
3838 /* No vector type is fine, the ref can still participate
3839 in dependence analysis, we just can't vectorize it. */
3840 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3841 continue;
3844 if (gatherscatter != SG_NONE || simd_lane_access)
3846 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3847 if (gatherscatter != SG_NONE)
3848 free_data_ref (dr);
3850 return false;
3852 else
3854 if (dump_enabled_p ())
3856 dump_printf_loc (MSG_NOTE, vect_location,
3857 "got vectype for stmt: ");
3858 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3859 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3860 STMT_VINFO_VECTYPE (stmt_info));
3861 dump_printf (MSG_NOTE, "\n");
3865 /* Adjust the minimal vectorization factor according to the
3866 vector type. */
3867 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3868 if (vf > *min_vf)
3869 *min_vf = vf;
3871 if (gatherscatter != SG_NONE)
3873 tree off;
3874 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3875 NULL, &off, NULL)
3876 || get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3878 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3879 free_data_ref (dr);
3880 if (dump_enabled_p ())
3882 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3883 (gatherscatter == GATHER) ?
3884 "not vectorized: not suitable for gather "
3885 "load " :
3886 "not vectorized: not suitable for scatter "
3887 "store ");
3888 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3889 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3891 return false;
3894 free_data_ref (datarefs[i]);
3895 datarefs[i] = dr;
3896 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3899 else if (is_a <loop_vec_info> (vinfo)
3900 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
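	  /* DR_STEP is not a compile-time constant, e.g. (illustration) for
	     a reference like a[i * n] with n loop-invariant but unknown at
	     compile time.  Such accesses cannot be handled in an inner loop
	     of an outer-loop vectorization; otherwise mark them as strided.  */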
3902 if (nested_in_vect_loop_p (loop, stmt))
3904 if (dump_enabled_p ())
3906 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3907 "not vectorized: not suitable for strided "
3908 "load ");
3909 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3910 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3912 return false;
3914 STMT_VINFO_STRIDED_P (stmt_info) = true;
3918 /* If we stopped analysis at the first dataref we could not analyze
3919 when trying to vectorize a basic-block mark the rest of the datarefs
3920 as not vectorizable and truncate the vector of datarefs. That
3921 avoids spending useless time in analyzing their dependence. */
3922 if (i != datarefs.length ())
3924 gcc_assert (is_a <bb_vec_info> (vinfo));
3925 for (unsigned j = i; j < datarefs.length (); ++j)
3927 data_reference_p dr = datarefs[j];
3928 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3929 free_data_ref (dr);
3931 datarefs.truncate (i);
3934 return true;
3938 /* Function vect_get_new_vect_var.
3940 Returns a name for a new variable.  The current naming scheme uses a
3941 prefix, determined by the value of VAR_KIND ("vect", "stmp", "mask" or
3942 "vectp"), for vectorizer generated variables, and appends "_" and NAME
3943 to it when NAME is provided. */
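/* For example (illustrative), with VAR_KIND == vect_pointer_var and
   NAME == "in" the name of the new variable is based on "vectp_in".  */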
3945 tree
3946 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3948 const char *prefix;
3949 tree new_vect_var;
3951 switch (var_kind)
3953 case vect_simple_var:
3954 prefix = "vect";
3955 break;
3956 case vect_scalar_var:
3957 prefix = "stmp";
3958 break;
3959 case vect_mask_var:
3960 prefix = "mask";
3961 break;
3962 case vect_pointer_var:
3963 prefix = "vectp";
3964 break;
3965 default:
3966 gcc_unreachable ();
3969 if (name)
3971 char* tmp = concat (prefix, "_", name, NULL);
3972 new_vect_var = create_tmp_reg (type, tmp);
3973 free (tmp);
3975 else
3976 new_vect_var = create_tmp_reg (type, prefix);
3978 return new_vect_var;
3981 /* Like vect_get_new_vect_var but return an SSA name. */
3983 tree
3984 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3986 const char *prefix;
3987 tree new_vect_var;
3989 switch (var_kind)
3991 case vect_simple_var:
3992 prefix = "vect";
3993 break;
3994 case vect_scalar_var:
3995 prefix = "stmp";
3996 break;
3997 case vect_pointer_var:
3998 prefix = "vectp";
3999 break;
4000 default:
4001 gcc_unreachable ();
4004 if (name)
4006 char* tmp = concat (prefix, "_", name, NULL);
4007 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4008 free (tmp);
4010 else
4011 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4013 return new_vect_var;
4016 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4018 static void
4019 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
4020 stmt_vec_info stmt_info)
4022 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4023 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
4024 int misalign = DR_MISALIGNMENT (dr);
4025 if (misalign == -1)
4026 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4027 else
4028 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
4031 /* Function vect_create_addr_base_for_vector_ref.
4033 Create an expression that computes the address of the first memory location
4034 that will be accessed for a data reference.
4036 Input:
4037 STMT: The statement containing the data reference.
4038 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4039 OFFSET: Optional. If supplied, it is added to the initial address.
4040 LOOP: Specify relative to which loop-nest the address should be computed.
4041 For example, when the dataref is in an inner-loop nested in an
4042 outer-loop that is now being vectorized, LOOP can be either the
4043 outer-loop, or the inner-loop. The first memory location accessed
4044 by the following dataref ('in' points to short):
4046 for (i=0; i<N; i++)
4047 for (j=0; j<M; j++)
4048 s += in[i+j]
4050 is as follows:
4051 if LOOP=i_loop: &in (relative to i_loop)
4052 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4053 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4054 initial address. Unlike OFFSET, which is the number of elements to
4055 be added, BYTE_OFFSET is measured in bytes.
4057 Output:
4058 1. Return an SSA_NAME whose value is the address of the memory location of
4059 the first vector of the data reference.
4060 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4061 these statement(s) which define the returned SSA_NAME.
4063 FORNOW: We are only handling array accesses with step 1. */
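/* Informally, in the loop case the address built below is

     DR_BASE_ADDRESS + DR_OFFSET + DR_INIT
       + OFFSET * sizeof (element) + BYTE_OFFSET

   with the OFFSET and BYTE_OFFSET terms present only when supplied; for
   basic-block vectorization the address of DR_REF itself is taken
   instead.  */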
4065 tree
4066 vect_create_addr_base_for_vector_ref (gimple *stmt,
4067 gimple_seq *new_stmt_list,
4068 tree offset,
4069 struct loop *loop,
4070 tree byte_offset)
4072 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4073 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4074 tree data_ref_base;
4075 const char *base_name;
4076 tree addr_base;
4077 tree dest;
4078 gimple_seq seq = NULL;
4079 tree base_offset;
4080 tree init;
4081 tree vect_ptr_type;
4082 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4083 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4085 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
4087 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
4089 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
4091 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4092 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
4093 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
4095 else
4097 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
4098 base_offset = unshare_expr (DR_OFFSET (dr));
4099 init = unshare_expr (DR_INIT (dr));
4102 if (loop_vinfo)
4103 base_name = get_name (data_ref_base);
4104 else
4106 base_offset = ssize_int (0);
4107 init = ssize_int (0);
4108 base_name = get_name (DR_REF (dr));
4111 /* Create base_offset */
4112 base_offset = size_binop (PLUS_EXPR,
4113 fold_convert (sizetype, base_offset),
4114 fold_convert (sizetype, init));
4116 if (offset)
4118 offset = fold_build2 (MULT_EXPR, sizetype,
4119 fold_convert (sizetype, offset), step);
4120 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4121 base_offset, offset);
4123 if (byte_offset)
4125 byte_offset = fold_convert (sizetype, byte_offset);
4126 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4127 base_offset, byte_offset);
4130 /* base + base_offset */
4131 if (loop_vinfo)
4132 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4133 else
4135 addr_base = build1 (ADDR_EXPR,
4136 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4137 unshare_expr (DR_REF (dr)));
4140 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4141 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4142 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4143 gimple_seq_add_seq (new_stmt_list, seq);
4145 if (DR_PTR_INFO (dr)
4146 && TREE_CODE (addr_base) == SSA_NAME
4147 && !SSA_NAME_PTR_INFO (addr_base))
4149 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4150 if (offset || byte_offset)
4151 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4154 if (dump_enabled_p ())
4156 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4157 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4158 dump_printf (MSG_NOTE, "\n");
4161 return addr_base;
4165 /* Function vect_create_data_ref_ptr.
4167 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4168 location accessed in the loop by STMT, along with the def-use update
4169 chain to appropriately advance the pointer through the loop iterations.
4170 Also set aliasing information for the pointer. This pointer is used by
4171 the callers to this function to create a memory reference expression for
4172 vector load/store access.
4174 Input:
4175 1. STMT: a stmt that references memory. Expected to be of the form
4176 GIMPLE_ASSIGN <name, data-ref> or
4177 GIMPLE_ASSIGN <data-ref, name>.
4178 2. AGGR_TYPE: the type of the reference, which should be either a vector
4179 or an array.
4180 3. AT_LOOP: the loop where the vector memref is to be created.
4181 4. OFFSET (optional): an offset to be added to the initial address accessed
4182 by the data-ref in STMT.
4183 5. BSI: location where the new stmts are to be placed if there is no loop
4184 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4185 pointing to the initial address.
4186 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4187 to the initial address accessed by the data-ref in STMT. This is
4188 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4189 in bytes.
4191 Output:
4192 1. Declare a new ptr to vector_type, and have it point to the base of the
4193 data reference (initial address accessed by the data reference).
4194 For example, for vector of type V8HI, the following code is generated:
4196 v8hi *ap;
4197 ap = (v8hi *)initial_address;
4199 if OFFSET is not supplied:
4200 initial_address = &a[init];
4201 if OFFSET is supplied:
4202 initial_address = &a[init + OFFSET];
4203 if BYTE_OFFSET is supplied:
4204 initial_address = &a[init] + BYTE_OFFSET;
4206 Return the initial_address in INITIAL_ADDRESS.
4208 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4209 update the pointer in each iteration of the loop.
4211 Return the increment stmt that updates the pointer in PTR_INCR.
4213 3. Set INV_P to true if the access pattern of the data reference in the
4214 vectorized loop is invariant. Set it to false otherwise.
4216 4. Return the pointer. */
4218 tree
4219 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4220 tree offset, tree *initial_address,
4221 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4222 bool only_init, bool *inv_p, tree byte_offset)
4224 const char *base_name;
4225 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4226 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4227 struct loop *loop = NULL;
4228 bool nested_in_vect_loop = false;
4229 struct loop *containing_loop = NULL;
4230 tree aggr_ptr_type;
4231 tree aggr_ptr;
4232 tree new_temp;
4233 gimple_seq new_stmt_list = NULL;
4234 edge pe = NULL;
4235 basic_block new_bb;
4236 tree aggr_ptr_init;
4237 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4238 tree aptr;
4239 gimple_stmt_iterator incr_gsi;
4240 bool insert_after;
4241 tree indx_before_incr, indx_after_incr;
4242 gimple *incr;
4243 tree step;
4244 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4246 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4247 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4249 if (loop_vinfo)
4251 loop = LOOP_VINFO_LOOP (loop_vinfo);
4252 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4253 containing_loop = (gimple_bb (stmt))->loop_father;
4254 pe = loop_preheader_edge (loop);
4256 else
4258 gcc_assert (bb_vinfo);
4259 only_init = true;
4260 *ptr_incr = NULL;
4263 /* Check the step (evolution) of the load in LOOP, and record
4264 whether it's invariant. */
4265 if (nested_in_vect_loop)
4266 step = STMT_VINFO_DR_STEP (stmt_info);
4267 else
4268 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4270 if (integer_zerop (step))
4271 *inv_p = true;
4272 else
4273 *inv_p = false;
4275 /* Create an expression for the first address accessed by this load
4276 in LOOP. */
4277 base_name = get_name (DR_BASE_ADDRESS (dr));
4279 if (dump_enabled_p ())
4281 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4282 dump_printf_loc (MSG_NOTE, vect_location,
4283 "create %s-pointer variable to type: ",
4284 get_tree_code_name (TREE_CODE (aggr_type)));
4285 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4286 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4287 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4288 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4289 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4290 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4291 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4292 else
4293 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4294 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4295 dump_printf (MSG_NOTE, "\n");
4298 /* (1) Create the new aggregate-pointer variable.
4299 Vector and array types inherit the alias set of their component
4300 type by default so we need to use a ref-all pointer if the data
4301 reference does not conflict with the created aggregated data
4302 reference because it is not addressable. */
4303 bool need_ref_all = false;
4304 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4305 get_alias_set (DR_REF (dr))))
4306 need_ref_all = true;
4307 /* Likewise for any of the data references in the stmt group. */
4308 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4310 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4313 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4314 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4315 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4316 get_alias_set (DR_REF (sdr))))
4318 need_ref_all = true;
4319 break;
4321 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4323 while (orig_stmt);
4325 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4326 need_ref_all);
4327 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4330 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4331 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4332 def-use update cycles for the pointer: one relative to the outer-loop
4333 (LOOP), which is what step (3) below does. The other is relative
4334 to the inner-loop (which is the inner-most loop containing the dataref),
4335 and this is done by step (4) below.
4337 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4338 inner-most loop, and so step (3) works the same, and step (4) is
4339 redundant. Step (3) creates the following:
4341 vp0 = &base_addr;
4342 LOOP: vp1 = phi(vp0,vp2)
4345 vp2 = vp1 + step
4346 goto LOOP
4348 If there is an inner-loop nested in loop, then step (4) will also be
4349 applied, and an additional update in the inner-loop will be created:
4351 vp0 = &base_addr;
4352 LOOP: vp1 = phi(vp0,vp2)
4354 inner: vp3 = phi(vp1,vp4)
4355 vp4 = vp3 + inner_step
4356 if () goto inner
4358 vp2 = vp1 + step
4359 if () goto LOOP */
4361 /* (2) Calculate the initial address of the aggregate-pointer, and set
4362 the aggregate-pointer to point to it before the loop. */
4364 /* Create: &(base[init_val+offset]) + byte_offset in the loop preheader. */
4366 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4367 offset, loop, byte_offset);
4368 if (new_stmt_list)
4370 if (pe)
4372 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4373 gcc_assert (!new_bb);
4375 else
4376 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4379 *initial_address = new_temp;
4380 aggr_ptr_init = new_temp;
4382 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4383 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4384 inner-loop nested in LOOP (during outer-loop vectorization). */
4386 /* No update in loop is required. */
4387 if (only_init && (!loop_vinfo || at_loop == loop))
4388 aptr = aggr_ptr_init;
4389 else
4391 /* The step of the aggregate pointer is the type size. */
4392 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4393 /* One exception to the above is when the scalar step of the load in
4394 LOOP is zero. In this case the step here is also zero. */
4395 if (*inv_p)
4396 iv_step = size_zero_node;
4397 else if (tree_int_cst_sgn (step) == -1)
4398 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4400 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4402 create_iv (aggr_ptr_init,
4403 fold_convert (aggr_ptr_type, iv_step),
4404 aggr_ptr, loop, &incr_gsi, insert_after,
4405 &indx_before_incr, &indx_after_incr);
4406 incr = gsi_stmt (incr_gsi);
4407 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4409 /* Copy the points-to information if it exists. */
4410 if (DR_PTR_INFO (dr))
4412 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4413 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4415 if (ptr_incr)
4416 *ptr_incr = incr;
4418 aptr = indx_before_incr;
4421 if (!nested_in_vect_loop || only_init)
4422 return aptr;
4425 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4426 nested in LOOP, if it exists. */
4428 gcc_assert (nested_in_vect_loop);
4429 if (!only_init)
4431 standard_iv_increment_position (containing_loop, &incr_gsi,
4432 &insert_after);
4433 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4434 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4435 &indx_after_incr);
4436 incr = gsi_stmt (incr_gsi);
4437 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4439 /* Copy the points-to information if it exists. */
4440 if (DR_PTR_INFO (dr))
4442 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4443 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4445 if (ptr_incr)
4446 *ptr_incr = incr;
4448 return indx_before_incr;
4450 else
4451 gcc_unreachable ();
4455 /* Function bump_vector_ptr
4457 Increment a pointer (to a vector type) by vector-size. If requested,
4458 i.e. if PTR_INCR is given, then also connect the new increment stmt
4459 to the existing def-use update-chain of the pointer, by modifying
4460 the PTR_INCR as illustrated below:
4462 The pointer def-use update-chain before this function:
4463 DATAREF_PTR = phi (p_0, p_2)
4464 ....
4465 PTR_INCR: p_2 = DATAREF_PTR + step
4467 The pointer def-use update-chain after this function:
4468 DATAREF_PTR = phi (p_0, p_2)
4469 ....
4470 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4471 ....
4472 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4474 Input:
4475 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4476 in the loop.
4477 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4478 the loop. The increment amount across iterations is expected
4479 to be vector_size.
4480 BSI - location where the new update stmt is to be placed.
4481 STMT - the original scalar memory-access stmt that is being vectorized.
4482 BUMP - optional. The offset by which to bump the pointer. If not given,
4483 the offset is assumed to be vector_size.
4485 Output: Return NEW_DATAREF_PTR as illustrated above.
4489 tree
4490 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4491 gimple *stmt, tree bump)
4493 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4494 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4495 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4496 tree update = TYPE_SIZE_UNIT (vectype);
4497 gassign *incr_stmt;
4498 ssa_op_iter iter;
4499 use_operand_p use_p;
4500 tree new_dataref_ptr;
4502 if (bump)
4503 update = bump;
4505 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4506 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4507 else
4508 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4509 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4510 dataref_ptr, update);
4511 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4513 /* Copy the points-to information if it exists. */
4514 if (DR_PTR_INFO (dr))
4516 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4517 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4520 if (!ptr_incr)
4521 return new_dataref_ptr;
4523 /* Update the vector-pointer's cross-iteration increment. */
4524 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4526 tree use = USE_FROM_PTR (use_p);
4528 if (use == dataref_ptr)
4529 SET_USE (use_p, new_dataref_ptr);
4530 else
4531 gcc_assert (tree_int_cst_compare (use, update) == 0);
4534 return new_dataref_ptr;
4538 /* Function vect_create_destination_var.
4540 Create a new temporary of type VECTYPE. */
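/* For example (illustrative), for a scalar_dest x_7 and a non-boolean
   VECTYPE the new temporary's name is based on "vect_x_7".  */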
4542 tree
4543 vect_create_destination_var (tree scalar_dest, tree vectype)
4545 tree vec_dest;
4546 const char *name;
4547 char *new_name;
4548 tree type;
4549 enum vect_var_kind kind;
4551 kind = vectype
4552 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4553 ? vect_mask_var
4554 : vect_simple_var
4555 : vect_scalar_var;
4556 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4558 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4560 name = get_name (scalar_dest);
4561 if (name)
4562 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4563 else
4564 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4565 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4566 free (new_name);
4568 return vec_dest;
4571 /* Function vect_grouped_store_supported.
4573 Returns TRUE if interleave high and interleave low permutations
4574 are supported, and FALSE otherwise. */
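/* For illustration: in the power-of-two case, assuming 4 elements per
   vector (nelt == 4), the two permutations checked below are

     interleave_high: { 0, 4, 1, 5 }
     interleave_low:  { 2, 6, 3, 7 }  */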
4576 bool
4577 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4579 machine_mode mode = TYPE_MODE (vectype);
4581 /* vect_permute_store_chain requires the group size to be equal to 3 or
4582 be a power of two. */
4583 if (count != 3 && exact_log2 (count) == -1)
4585 if (dump_enabled_p ())
4586 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4587 "the size of the group of accesses"
4588 " is not a power of 2 or not eqaul to 3\n");
4589 return false;
4592 /* Check that the permutation is supported. */
4593 if (VECTOR_MODE_P (mode))
4595 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4596 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4598 if (count == 3)
4600 unsigned int j0 = 0, j1 = 0, j2 = 0;
4601 unsigned int i, j;
4603 for (j = 0; j < 3; j++)
4605 int nelt0 = ((3 - j) * nelt) % 3;
4606 int nelt1 = ((3 - j) * nelt + 1) % 3;
4607 int nelt2 = ((3 - j) * nelt + 2) % 3;
4608 for (i = 0; i < nelt; i++)
4610 if (3 * i + nelt0 < nelt)
4611 sel[3 * i + nelt0] = j0++;
4612 if (3 * i + nelt1 < nelt)
4613 sel[3 * i + nelt1] = nelt + j1++;
4614 if (3 * i + nelt2 < nelt)
4615 sel[3 * i + nelt2] = 0;
4617 if (!can_vec_perm_p (mode, false, sel))
4619 if (dump_enabled_p ())
4620 dump_printf (MSG_MISSED_OPTIMIZATION,
4621 "permutaion op not supported by target.\n");
4622 return false;
4625 for (i = 0; i < nelt; i++)
4627 if (3 * i + nelt0 < nelt)
4628 sel[3 * i + nelt0] = 3 * i + nelt0;
4629 if (3 * i + nelt1 < nelt)
4630 sel[3 * i + nelt1] = 3 * i + nelt1;
4631 if (3 * i + nelt2 < nelt)
4632 sel[3 * i + nelt2] = nelt + j2++;
4634 if (!can_vec_perm_p (mode, false, sel))
4636 if (dump_enabled_p ())
4637 dump_printf (MSG_MISSED_OPTIMIZATION,
4638 "permutaion op not supported by target.\n");
4639 return false;
4642 return true;
4644 else
4646 /* If length is not equal to 3 then only a power of 2 is supported. */
4647 gcc_assert (exact_log2 (count) != -1);
4649 for (i = 0; i < nelt / 2; i++)
4651 sel[i * 2] = i;
4652 sel[i * 2 + 1] = i + nelt;
4654 if (can_vec_perm_p (mode, false, sel))
4656 for (i = 0; i < nelt; i++)
4657 sel[i] += nelt / 2;
4658 if (can_vec_perm_p (mode, false, sel))
4659 return true;
4664 if (dump_enabled_p ())
4665 dump_printf (MSG_MISSED_OPTIMIZATION,
4666 "permutaion op not supported by target.\n");
4667 return false;
4671 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4672 type VECTYPE. */
4674 bool
4675 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4677 return vect_lanes_optab_supported_p ("vec_store_lanes",
4678 vec_store_lanes_optab,
4679 vectype, count);
4683 /* Function vect_permute_store_chain.
4685 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4686 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4687 the data correctly for the stores. Return the final references for stores
4688 in RESULT_CHAIN.
4690 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4691 The input is 4 vectors each containing 8 elements. We assign a number to
4692 each element, the input sequence is:
4694 1st vec: 0 1 2 3 4 5 6 7
4695 2nd vec: 8 9 10 11 12 13 14 15
4696 3rd vec: 16 17 18 19 20 21 22 23
4697 4th vec: 24 25 26 27 28 29 30 31
4699 The output sequence should be:
4701 1st vec: 0 8 16 24 1 9 17 25
4702 2nd vec: 2 10 18 26 3 11 19 27
4703 3rd vec: 4 12 20 28 5 13 21 29
4704 4th vec: 6 14 22 30 7 15 23 31
4706 i.e., we interleave the contents of the four vectors in their order.
4708 We use interleave_high/low instructions to create such output. The input of
4709 each interleave_high/low operation is two vectors:
4710 1st vec 2nd vec
4711 0 1 2 3 4 5 6 7
4712 the even elements of the result vector are obtained left-to-right from the
4713 high/low elements of the first vector. The odd elements of the result are
4714 obtained left-to-right from the high/low elements of the second vector.
4715 The output of interleave_high will be: 0 4 1 5
4716 and of interleave_low: 2 6 3 7
4719 The permutation is done in log LENGTH stages. In each stage interleave_high
4720 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4721 where the first argument is taken from the first half of DR_CHAIN and the
4722 second argument from its second half.
4723 In our example,
4725 I1: interleave_high (1st vec, 3rd vec)
4726 I2: interleave_low (1st vec, 3rd vec)
4727 I3: interleave_high (2nd vec, 4th vec)
4728 I4: interleave_low (2nd vec, 4th vec)
4730 The output for the first stage is:
4732 I1: 0 16 1 17 2 18 3 19
4733 I2: 4 20 5 21 6 22 7 23
4734 I3: 8 24 9 25 10 26 11 27
4735 I4: 12 28 13 29 14 30 15 31
4737 The output of the second stage, i.e. the final result is:
4739 I1: 0 8 16 24 1 9 17 25
4740 I2: 2 10 18 26 3 11 19 27
4741 I3: 4 12 20 28 5 13 21 29
4742 I4: 6 14 22 30 7 15 23 31. */
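/* For illustration: with the V8HI example above (nelt == 8), the two
   selectors built below for the power-of-two path are

     perm_mask_high: { 0, 8, 1, 9, 2, 10, 3, 11 }
     perm_mask_low:  { 4, 12, 5, 13, 6, 14, 7, 15 }  */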
4744 void
4745 vect_permute_store_chain (vec<tree> dr_chain,
4746 unsigned int length,
4747 gimple *stmt,
4748 gimple_stmt_iterator *gsi,
4749 vec<tree> *result_chain)
4751 tree vect1, vect2, high, low;
4752 gimple *perm_stmt;
4753 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4754 tree perm_mask_low, perm_mask_high;
4755 tree data_ref;
4756 tree perm3_mask_low, perm3_mask_high;
4757 unsigned int i, n, log_length = exact_log2 (length);
4758 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4759 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4761 result_chain->quick_grow (length);
4762 memcpy (result_chain->address (), dr_chain.address (),
4763 length * sizeof (tree));
4765 if (length == 3)
4767 unsigned int j0 = 0, j1 = 0, j2 = 0;
4769 for (j = 0; j < 3; j++)
4771 int nelt0 = ((3 - j) * nelt) % 3;
4772 int nelt1 = ((3 - j) * nelt + 1) % 3;
4773 int nelt2 = ((3 - j) * nelt + 2) % 3;
4775 for (i = 0; i < nelt; i++)
4777 if (3 * i + nelt0 < nelt)
4778 sel[3 * i + nelt0] = j0++;
4779 if (3 * i + nelt1 < nelt)
4780 sel[3 * i + nelt1] = nelt + j1++;
4781 if (3 * i + nelt2 < nelt)
4782 sel[3 * i + nelt2] = 0;
4784 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4786 for (i = 0; i < nelt; i++)
4788 if (3 * i + nelt0 < nelt)
4789 sel[3 * i + nelt0] = 3 * i + nelt0;
4790 if (3 * i + nelt1 < nelt)
4791 sel[3 * i + nelt1] = 3 * i + nelt1;
4792 if (3 * i + nelt2 < nelt)
4793 sel[3 * i + nelt2] = nelt + j2++;
4795 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4797 vect1 = dr_chain[0];
4798 vect2 = dr_chain[1];
4800 /* Create interleaving stmt:
4801 low = VEC_PERM_EXPR <vect1, vect2,
4802 {j, nelt, *, j + 1, nelt + j + 1, *,
4803 j + 2, nelt + j + 2, *, ...}> */
4804 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4805 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4806 vect2, perm3_mask_low);
4807 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4809 vect1 = data_ref;
4810 vect2 = dr_chain[2];
4811 /* Create interleaving stmt:
4812 low = VEC_PERM_EXPR <vect1, vect2,
4813 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4814 6, 7, nelt + j + 2, ...}> */
4815 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4816 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4817 vect2, perm3_mask_high);
4818 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4819 (*result_chain)[j] = data_ref;
4822 else
4824 /* If length is not equal to 3 then only a power of 2 is supported. */
4825 gcc_assert (exact_log2 (length) != -1);
4827 for (i = 0, n = nelt / 2; i < n; i++)
4829 sel[i * 2] = i;
4830 sel[i * 2 + 1] = i + nelt;
4832 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4834 for (i = 0; i < nelt; i++)
4835 sel[i] += nelt / 2;
4836 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4838 for (i = 0, n = log_length; i < n; i++)
4840 for (j = 0; j < length/2; j++)
4842 vect1 = dr_chain[j];
4843 vect2 = dr_chain[j+length/2];
4845 /* Create interleaving stmt:
4846 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4847 ...}> */
4848 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4849 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4850 vect2, perm_mask_high);
4851 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4852 (*result_chain)[2*j] = high;
4854 /* Create interleaving stmt:
4855 low = VEC_PERM_EXPR <vect1, vect2,
4856 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4857 ...}> */
4858 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4859 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4860 vect2, perm_mask_low);
4861 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4862 (*result_chain)[2*j+1] = low;
4864 memcpy (dr_chain.address (), result_chain->address (),
4865 length * sizeof (tree));
4870 /* Function vect_setup_realignment
4872 This function is called when vectorizing an unaligned load using
4873 the dr_explicit_realign[_optimized] scheme.
4874 This function generates the following code at the loop prolog:
4876 p = initial_addr;
4877 x msq_init = *(floor(p)); # prolog load
4878 realignment_token = call target_builtin;
4879 loop:
4880 x msq = phi (msq_init, ---)
4882 The stmts marked with x are generated only for the case of
4883 dr_explicit_realign_optimized.
4885 The code above sets up a new (vector) pointer, pointing to the first
4886 location accessed by STMT, and a "floor-aligned" load using that pointer.
4887 It also generates code to compute the "realignment-token" (if the relevant
4888 target hook was defined), and creates a phi-node at the loop-header bb
4889 whose arguments are the result of the prolog-load (created by this
4890 function) and the result of a load that takes place in the loop (to be
4891 created by the caller to this function).
4893 For the case of dr_explicit_realign_optimized:
4894 The caller to this function uses the phi-result (msq) to create the
4895 realignment code inside the loop, and sets up the missing phi argument,
4896 as follows:
4897 loop:
4898 msq = phi (msq_init, lsq)
4899 lsq = *(floor(p')); # load in loop
4900 result = realign_load (msq, lsq, realignment_token);
4902 For the case of dr_explicit_realign:
4903 loop:
4904 msq = *(floor(p)); # load in loop
4905 p' = p + (VS-1);
4906 lsq = *(floor(p')); # load in loop
4907 result = realign_load (msq, lsq, realignment_token);
4909 Input:
4910 STMT - (scalar) load stmt to be vectorized. This load accesses
4911 a memory location that may be unaligned.
4912 BSI - place where new code is to be inserted.
4913 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4914 is used.
4916 Output:
4917 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4918 target hook, if defined.
4919 Return value - the result of the loop-header phi node. */
4921 tree
4922 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4923 tree *realignment_token,
4924 enum dr_alignment_support alignment_support_scheme,
4925 tree init_addr,
4926 struct loop **at_loop)
4928 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4929 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4930 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4931 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4932 struct loop *loop = NULL;
4933 edge pe = NULL;
4934 tree scalar_dest = gimple_assign_lhs (stmt);
4935 tree vec_dest;
4936 gimple *inc;
4937 tree ptr;
4938 tree data_ref;
4939 basic_block new_bb;
4940 tree msq_init = NULL_TREE;
4941 tree new_temp;
4942 gphi *phi_stmt;
4943 tree msq = NULL_TREE;
4944 gimple_seq stmts = NULL;
4945 bool inv_p;
4946 bool compute_in_loop = false;
4947 bool nested_in_vect_loop = false;
4948 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4949 struct loop *loop_for_initial_load = NULL;
4951 if (loop_vinfo)
4953 loop = LOOP_VINFO_LOOP (loop_vinfo);
4954 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4957 gcc_assert (alignment_support_scheme == dr_explicit_realign
4958 || alignment_support_scheme == dr_explicit_realign_optimized);
4960 /* We need to generate three things:
4961 1. the misalignment computation
4962 2. the extra vector load (for the optimized realignment scheme).
4963 3. the phi node for the two vectors from which the realignment is
4964 done (for the optimized realignment scheme). */
4966 /* 1. Determine where to generate the misalignment computation.
4968 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4969 calculation will be generated by this function, outside the loop (in the
4970 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4971 caller, inside the loop.
4973 Background: If the misalignment remains fixed throughout the iterations of
4974 the loop, then both realignment schemes are applicable, and also the
4975 misalignment computation can be done outside LOOP. This is because we are
4976 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4977 are a multiple of VS (the Vector Size), and therefore the misalignment in
4978 different vectorized LOOP iterations is always the same.
4979 The problem arises only if the memory access is in an inner-loop nested
4980 inside LOOP, which is now being vectorized using outer-loop vectorization.
4981 This is the only case when the misalignment of the memory access may not
4982 remain fixed throughout the iterations of the inner-loop (as explained in
4983 detail in vect_supportable_dr_alignment). In this case, not only is the
4984 optimized realignment scheme not applicable, but also the misalignment
4985 computation (and generation of the realignment token that is passed to
4986 REALIGN_LOAD) have to be done inside the loop.
4988 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4989 or not, which in turn determines if the misalignment is computed inside
4990 the inner-loop, or outside LOOP. */
4992 if (init_addr != NULL_TREE || !loop_vinfo)
4994 compute_in_loop = true;
4995 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4999 /* 2. Determine where to generate the extra vector load.
5001 For the optimized realignment scheme, instead of generating two vector
5002 loads in each iteration, we generate a single extra vector load in the
5003 preheader of the loop, and in each iteration reuse the result of the
5004 vector load from the previous iteration. In case the memory access is in
5005 an inner-loop nested inside LOOP, which is now being vectorized using
5006 outer-loop vectorization, we need to determine whether this initial vector
5007 load should be generated at the preheader of the inner-loop, or can be
5008 generated at the preheader of LOOP. If the memory access has no evolution
5009 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5010 to be generated inside LOOP (in the preheader of the inner-loop). */
5012 if (nested_in_vect_loop)
5014 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5015 bool invariant_in_outerloop =
5016 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5017 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5019 else
5020 loop_for_initial_load = loop;
5021 if (at_loop)
5022 *at_loop = loop_for_initial_load;
5024 if (loop_for_initial_load)
5025 pe = loop_preheader_edge (loop_for_initial_load);
5027 /* 3. For the case of the optimized realignment, create the first vector
5028 load at the loop preheader. */
5030 if (alignment_support_scheme == dr_explicit_realign_optimized)
5032 /* Create msq_init = *(floor(p1)) in the loop preheader */
5033 gassign *new_stmt;
5035 gcc_assert (!compute_in_loop);
5036 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5037 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5038 NULL_TREE, &init_addr, NULL, &inc,
5039 true, &inv_p);
5040 if (TREE_CODE (ptr) == SSA_NAME)
5041 new_temp = copy_ssa_name (ptr);
5042 else
5043 new_temp = make_ssa_name (TREE_TYPE (ptr));
5044 new_stmt = gimple_build_assign
5045 (new_temp, BIT_AND_EXPR, ptr,
5046 build_int_cst (TREE_TYPE (ptr),
5047 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
5048 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5049 gcc_assert (!new_bb);
5050 data_ref
5051 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5052 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5053 new_stmt = gimple_build_assign (vec_dest, data_ref);
5054 new_temp = make_ssa_name (vec_dest, new_stmt);
5055 gimple_assign_set_lhs (new_stmt, new_temp);
5056 if (pe)
5058 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5059 gcc_assert (!new_bb);
5061 else
5062 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5064 msq_init = gimple_assign_lhs (new_stmt);
5067 /* 4. Create realignment token using a target builtin, if available.
5068 It is done either inside the containing loop, or before LOOP (as
5069 determined above). */
5071 if (targetm.vectorize.builtin_mask_for_load)
5073 gcall *new_stmt;
5074 tree builtin_decl;
5076 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5077 if (!init_addr)
5079 /* Generate the INIT_ADDR computation outside LOOP. */
5080 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5081 NULL_TREE, loop);
5082 if (loop)
5084 pe = loop_preheader_edge (loop);
5085 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5086 gcc_assert (!new_bb);
5088 else
5089 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5092 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5093 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5094 vec_dest =
5095 vect_create_destination_var (scalar_dest,
5096 gimple_call_return_type (new_stmt));
5097 new_temp = make_ssa_name (vec_dest, new_stmt);
5098 gimple_call_set_lhs (new_stmt, new_temp);
5100 if (compute_in_loop)
5101 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5102 else
5104 /* Generate the misalignment computation outside LOOP. */
5105 pe = loop_preheader_edge (loop);
5106 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5107 gcc_assert (!new_bb);
5110 *realignment_token = gimple_call_lhs (new_stmt);
5112 /* The result of the CALL_EXPR to this builtin is determined from
5113 the value of the parameter and no global variables are touched
5114 which makes the builtin a "const" function. Requiring the
5115 builtin to have the "const" attribute makes it unnecessary
5116 to call mark_call_clobbered. */
5117 gcc_assert (TREE_READONLY (builtin_decl));
5120 if (alignment_support_scheme == dr_explicit_realign)
5121 return msq;
5123 gcc_assert (!compute_in_loop);
5124 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5127 /* 5. Create msq = phi <msq_init, lsq> in loop */
5129 pe = loop_preheader_edge (containing_loop);
5130 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5131 msq = make_ssa_name (vec_dest);
5132 phi_stmt = create_phi_node (msq, containing_loop->header);
5133 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5135 return msq;
5139 /* Function vect_grouped_load_supported.
5141 Returns TRUE if even and odd permutations are supported,
5142 and FALSE otherwise. */
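/* For illustration: in the power-of-two case, assuming 4 elements per
   vector (nelt == 4), the two permutations checked below are

     extract_even: { 0, 2, 4, 6 }
     extract_odd:  { 1, 3, 5, 7 }  */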
5144 bool
5145 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
5147 machine_mode mode = TYPE_MODE (vectype);
5149 /* vect_permute_load_chain requires the group size to be equal to 3 or
5150 be a power of two. */
5151 if (count != 3 && exact_log2 (count) == -1)
5153 if (dump_enabled_p ())
5154 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5155 "the size of the group of accesses"
5156 " is not a power of 2 or not equal to 3\n");
5157 return false;
5160 /* Check that the permutation is supported. */
5161 if (VECTOR_MODE_P (mode))
5163 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5164 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5166 if (count == 3)
5168 unsigned int k;
5169 for (k = 0; k < 3; k++)
5171 for (i = 0; i < nelt; i++)
5172 if (3 * i + k < 2 * nelt)
5173 sel[i] = 3 * i + k;
5174 else
5175 sel[i] = 0;
5176 if (!can_vec_perm_p (mode, false, sel))
5178 if (dump_enabled_p ())
5179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5180 "shuffle of 3 loads is not supported by"
5181 " target\n");
5182 return false;
5184 for (i = 0, j = 0; i < nelt; i++)
5185 if (3 * i + k < 2 * nelt)
5186 sel[i] = i;
5187 else
5188 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5189 if (!can_vec_perm_p (mode, false, sel))
5191 if (dump_enabled_p ())
5192 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5193 "shuffle of 3 loads is not supported by"
5194 " target\n");
5195 return false;
5198 return true;
5200 else
5202 /* If length is not equal to 3 then only a power of 2 is supported. */
5203 gcc_assert (exact_log2 (count) != -1);
5204 for (i = 0; i < nelt; i++)
5205 sel[i] = i * 2;
5206 if (can_vec_perm_p (mode, false, sel))
5208 for (i = 0; i < nelt; i++)
5209 sel[i] = i * 2 + 1;
5210 if (can_vec_perm_p (mode, false, sel))
5211 return true;
5216 if (dump_enabled_p ())
5217 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5218 "extract even/odd not supported by target\n");
5219 return false;
5222 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5223 type VECTYPE. */
5225 bool
5226 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5228 return vect_lanes_optab_supported_p ("vec_load_lanes",
5229 vec_load_lanes_optab,
5230 vectype, count);
5233 /* Function vect_permute_load_chain.
5235 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5236 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5237 the input data correctly. Return the final references for loads in
5238 RESULT_CHAIN.
5240 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5241 The input is 4 vectors each containing 8 elements. We assign a number to each
5242 element, the input sequence is:
5244 1st vec: 0 1 2 3 4 5 6 7
5245 2nd vec: 8 9 10 11 12 13 14 15
5246 3rd vec: 16 17 18 19 20 21 22 23
5247 4th vec: 24 25 26 27 28 29 30 31
5249 The output sequence should be:
5251 1st vec: 0 4 8 12 16 20 24 28
5252 2nd vec: 1 5 9 13 17 21 25 29
5253 3rd vec: 2 6 10 14 18 22 26 30
5254 4th vec: 3 7 11 15 19 23 27 31
5256 i.e., the first output vector should contain the first elements of each
5257 interleaving group, etc.
5259 We use extract_even/odd instructions to create such output. The input of
5260 each extract_even/odd operation is two vectors
5261 1st vec 2nd vec
5262 0 1 2 3 4 5 6 7
5264 and the output is the vector of extracted even/odd elements. The output of
5265 extract_even will be: 0 2 4 6
5266 and of extract_odd: 1 3 5 7
5269 The permutation is done in log LENGTH stages. In each stage extract_even
5270 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5271 their order. In our example,
5273 E1: extract_even (1st vec, 2nd vec)
5274 E2: extract_odd (1st vec, 2nd vec)
5275 E3: extract_even (3rd vec, 4th vec)
5276 E4: extract_odd (3rd vec, 4th vec)
5278 The output for the first stage will be:
5280 E1: 0 2 4 6 8 10 12 14
5281 E2: 1 3 5 7 9 11 13 15
5282 E3: 16 18 20 22 24 26 28 30
5283 E4: 17 19 21 23 25 27 29 31
5285 In order to proceed and create the correct sequence for the next stage (or
5286 for the correct output, if the second stage is the last one, as in our
5287 example), we first put the output of extract_even operation and then the
5288 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5289 The input for the second stage is:
5291 1st vec (E1): 0 2 4 6 8 10 12 14
5292 2nd vec (E3): 16 18 20 22 24 26 28 30
5293 3rd vec (E2): 1 3 5 7 9 11 13 15
5294 4th vec (E4): 17 19 21 23 25 27 29 31
5296 The output of the second stage:
5298 E1: 0 4 8 12 16 20 24 28
5299 E2: 2 6 10 14 18 22 26 30
5300 E3: 1 5 9 13 17 21 25 29
5301 E4: 3 7 11 15 19 23 27 31
5303 And RESULT_CHAIN after reordering:
5305 1st vec (E1): 0 4 8 12 16 20 24 28
5306 2nd vec (E3): 1 5 9 13 17 21 25 29
5307 3rd vec (E2): 2 6 10 14 18 22 26 30
5308 4th vec (E4): 3 7 11 15 19 23 27 31. */
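/* For illustration: with the V8HI example above (nelt == 8), the two
   selectors built below for the power-of-two path are

     perm_mask_even: { 0, 2, 4, 6, 8, 10, 12, 14 }
     perm_mask_odd:  { 1, 3, 5, 7, 9, 11, 13, 15 }  */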
5310 static void
5311 vect_permute_load_chain (vec<tree> dr_chain,
5312 unsigned int length,
5313 gimple *stmt,
5314 gimple_stmt_iterator *gsi,
5315 vec<tree> *result_chain)
5317 tree data_ref, first_vect, second_vect;
5318 tree perm_mask_even, perm_mask_odd;
5319 tree perm3_mask_low, perm3_mask_high;
5320 gimple *perm_stmt;
5321 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5322 unsigned int i, j, log_length = exact_log2 (length);
5323 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5324 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5326 result_chain->quick_grow (length);
5327 memcpy (result_chain->address (), dr_chain.address (),
5328 length * sizeof (tree));
5330 if (length == 3)
5332 unsigned int k;
5334 for (k = 0; k < 3; k++)
5336 for (i = 0; i < nelt; i++)
5337 if (3 * i + k < 2 * nelt)
5338 sel[i] = 3 * i + k;
5339 else
5340 sel[i] = 0;
5341 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5343 for (i = 0, j = 0; i < nelt; i++)
5344 if (3 * i + k < 2 * nelt)
5345 sel[i] = i;
5346 else
5347 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5349 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5351 first_vect = dr_chain[0];
5352 second_vect = dr_chain[1];
5354 /* Create interleaving stmt (low part of):
5355 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5356 ...}> */
5357 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5358 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5359 second_vect, perm3_mask_low);
5360 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5362 /* Create interleaving stmt (high part of):
5363 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5364 ...}> */
5365 first_vect = data_ref;
5366 second_vect = dr_chain[2];
5367 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5368 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5369 second_vect, perm3_mask_high);
5370 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5371 (*result_chain)[k] = data_ref;
5374 else
5376 /* If length is not equal to 3 then only a power of 2 is supported. */
5377 gcc_assert (exact_log2 (length) != -1);
5379 for (i = 0; i < nelt; ++i)
5380 sel[i] = i * 2;
5381 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5383 for (i = 0; i < nelt; ++i)
5384 sel[i] = i * 2 + 1;
5385 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5387 for (i = 0; i < log_length; i++)
5389 for (j = 0; j < length; j += 2)
5391 first_vect = dr_chain[j];
5392 second_vect = dr_chain[j+1];
5394 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5395 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5396 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5397 first_vect, second_vect,
5398 perm_mask_even);
5399 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5400 (*result_chain)[j/2] = data_ref;
5402 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5403 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5404 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5405 first_vect, second_vect,
5406 perm_mask_odd);
5407 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5408 (*result_chain)[j/2+length/2] = data_ref;
5410 memcpy (dr_chain.address (), result_chain->address (),
5411 length * sizeof (tree));
5416 /* Function vect_shift_permute_load_chain.
5418 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5419 sequence of stmts to reorder the input data accordingly.
5420 Return the final references for loads in RESULT_CHAIN.
5421 Return true if successful, false otherwise.
5423 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5424 The input is 3 vectors each containing 8 elements. We assign a
5425 number to each element, the input sequence is:
5427 1st vec: 0 1 2 3 4 5 6 7
5428 2nd vec: 8 9 10 11 12 13 14 15
5429 3rd vec: 16 17 18 19 20 21 22 23
5431 The output sequence should be:
5433 1st vec: 0 3 6 9 12 15 18 21
5434 2nd vec: 1 4 7 10 13 16 19 22
5435 3rd vec: 2 5 8 11 14 17 20 23
5437 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such an output.
5439 First we shuffle all 3 vectors to get correct elements order:
5441 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5442 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5443 3rd vec: (16 19 22) (17 20 23) (18 21)
5445 Next we unite and shift the vectors 3 times:
5447 1st step:
5448 shift right by 6 the concatenation of:
5449 "1st vec" and "2nd vec"
5450 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5451 "2nd vec" and "3rd vec"
5452 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5453 "3rd vec" and "1st vec"
5454 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5455 | New vectors |
5457 So that now new vectors are:
5459 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5460 2nd vec: (10 13) (16 19 22) (17 20 23)
5461 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5463 2nd step:
5464 shift right by 5 the concatenation of:
5465 "1st vec" and "3rd vec"
5466 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5467 "2nd vec" and "1st vec"
5468 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5469 "3rd vec" and "2nd vec"
5470 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5471 | New vectors |
5473 So that now new vectors are:
5475 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5476 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5477 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5479 3rd step:
5480 shift right by 5 the concatenation of:
5481 "1st vec" and "1st vec"
5482 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5483 shift right by 3 the concatenation of:
5484 "2nd vec" and "2nd vec"
5485 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5486 | New vectors |
5488 So that now all vectors are READY:
5489 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5490 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5491 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5493 This algorithm is faster than the one in vect_permute_load_chain if:
5494 1. "shift of a concatenation" is faster than a general permutation.
5495 This is usually so.
5496 2. The TARGET machine can't execute vector instructions in parallel.
5497 This is because each step of the algorithm depends on the previous one.
5498 The algorithm in vect_permute_load_chain is much more parallel.
5500 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
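For the LENGTH == 3 case with nelt == 8, the code below implements the
initial per-vector shuffle with perm3_mask, the 1st and 2nd steps with
shift1_mask and shift2_mask respectively, and the 3rd step with
shift3_mask and shift4_mask.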
5503 static bool
5504 vect_shift_permute_load_chain (vec<tree> dr_chain,
5505 unsigned int length,
5506 gimple *stmt,
5507 gimple_stmt_iterator *gsi,
5508 vec<tree> *result_chain)
5510 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5511 tree perm2_mask1, perm2_mask2, perm3_mask;
5512 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5513 gimple *perm_stmt;
5515 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5516 unsigned int i;
5517 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5518 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5519 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5520 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5522 result_chain->quick_grow (length);
5523 memcpy (result_chain->address (), dr_chain.address (),
5524 length * sizeof (tree));
5526 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5528 unsigned int j, log_length = exact_log2 (length);
5529 for (i = 0; i < nelt / 2; ++i)
5530 sel[i] = i * 2;
5531 for (i = 0; i < nelt / 2; ++i)
5532 sel[nelt / 2 + i] = i * 2 + 1;
5533 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5535 if (dump_enabled_p ())
5536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5537 "shuffle of 2 fields structure is not \
5538 supported by target\n");
5539 return false;
5541 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5543 for (i = 0; i < nelt / 2; ++i)
5544 sel[i] = i * 2 + 1;
5545 for (i = 0; i < nelt / 2; ++i)
5546 sel[nelt / 2 + i] = i * 2;
5547 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5549 if (dump_enabled_p ())
5550 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5551 "shuffle of 2 fields structure is not \
5552 supported by target\n");
5553 return false;
5555 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
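/* For example, with nelt == 8 (e.g. V8HI) perm2_mask1 is
{0 2 4 6 1 3 5 7} and perm2_mask2 is {1 3 5 7 0 2 4 6}: applied to
(v, v) the first places the even elements of v in the low half and the
odd elements in the high half, while the second does the opposite. */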
5557 /* Generating permutation constant to shift all elements.
5558 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5559 for (i = 0; i < nelt; i++)
5560 sel[i] = nelt / 2 + i;
5561 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5563 if (dump_enabled_p ())
5564 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5565 "shift permutation is not supported by target\n");
5566 return false;
5568 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5570 /* Generating permutation constant to select vector from 2.
5571 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5572 for (i = 0; i < nelt / 2; i++)
5573 sel[i] = i;
5574 for (i = nelt / 2; i < nelt; i++)
5575 sel[i] = nelt + i;
5576 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5578 if (dump_enabled_p ())
5579 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5580 "select is not supported by target\n");
5581 return false;
5583 select_mask = vect_gen_perm_mask_checked (vectype, sel);
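/* In the loop below vect[0] holds evens-then-odds of the first vector
and vect[1] holds odds-then-evens of the second; shift1_mask
({4 5 6 7 8 9 10 11} for nelt == 8) then extracts the two odd halves
and select_mask ({0 1 2 3 12 13 14 15}) the two even halves, so each
pass separates the even-indexed and odd-indexed elements of the chain. */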
5585 for (i = 0; i < log_length; i++)
5587 for (j = 0; j < length; j += 2)
5589 first_vect = dr_chain[j];
5590 second_vect = dr_chain[j + 1];
5592 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5593 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5594 first_vect, first_vect,
5595 perm2_mask1);
5596 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5597 vect[0] = data_ref;
5599 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5600 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5601 second_vect, second_vect,
5602 perm2_mask2);
5603 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5604 vect[1] = data_ref;
5606 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5607 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5608 vect[0], vect[1], shift1_mask);
5609 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5610 (*result_chain)[j/2 + length/2] = data_ref;
5612 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5613 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5614 vect[0], vect[1], select_mask);
5615 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5616 (*result_chain)[j/2] = data_ref;
5618 memcpy (dr_chain.address (), result_chain->address (),
5619 length * sizeof (tree));
5621 return true;
5623 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5625 unsigned int k = 0, l = 0;
5627 /* Generating permutation constant to get all elements in the right order.
5628 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5629 for (i = 0; i < nelt; i++)
5631 if (3 * k + (l % 3) >= nelt)
5633 k = 0;
5634 l += (3 - (nelt % 3));
5636 sel[i] = 3 * k + (l % 3);
5637 k++;
5639 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5641 if (dump_enabled_p ())
5642 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5643 "shuffle of 3 fields structure is not \
5644 supported by target\n");
5645 return false;
5647 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5649 /* Generating permutation constant to shift all elements.
5650 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5651 for (i = 0; i < nelt; i++)
5652 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5653 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5655 if (dump_enabled_p ())
5656 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5657 "shift permutation is not supported by target\n");
5658 return false;
5660 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5662 /* Generating permutation constant to shift all elements.
5663 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5664 for (i = 0; i < nelt; i++)
5665 sel[i] = 2 * (nelt / 3) + 1 + i;
5666 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5668 if (dump_enabled_p ())
5669 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5670 "shift permutation is not supported by target\n");
5671 return false;
5673 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5675 /* Generating permutation constant to shift all elements.
5676 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5677 for (i = 0; i < nelt; i++)
5678 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5679 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5681 if (dump_enabled_p ())
5682 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5683 "shift permutation is not supported by target\n");
5684 return false;
5686 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5688 /* Generating permutation constant to shift all elements.
5689 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5690 for (i = 0; i < nelt; i++)
5691 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5692 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5694 if (dump_enabled_p ())
5695 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5696 "shift permutation is not supported by target\n");
5697 return false;
5699 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
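/* Apply the masks in the order described in the function comment: first
the per-vector 3-field shuffle (perm3_mask), then the two shift rounds
(shift1_mask, shift2_mask), and finally one single-vector shift each
(shift3_mask, shift4_mask) for the two vectors that are not yet in
their final order. */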
5701 for (k = 0; k < 3; k++)
5703 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5704 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5705 dr_chain[k], dr_chain[k],
5706 perm3_mask);
5707 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5708 vect[k] = data_ref;
5711 for (k = 0; k < 3; k++)
5713 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5714 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5715 vect[k % 3], vect[(k + 1) % 3],
5716 shift1_mask);
5717 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5718 vect_shift[k] = data_ref;
5721 for (k = 0; k < 3; k++)
5723 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5724 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5725 vect_shift[(4 - k) % 3],
5726 vect_shift[(3 - k) % 3],
5727 shift2_mask);
5728 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5729 vect[k] = data_ref;
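/* After the two shift rounds vect[2] already has its elements in final
order and is stored directly; vect[0] and vect[1] each need one more
single-vector shift (shift3_mask and shift4_mask) before being stored. */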
5732 (*result_chain)[3 - (nelt % 3)] = vect[2];
5734 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5735 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5736 vect[0], shift3_mask);
5737 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5738 (*result_chain)[nelt % 3] = data_ref;
5740 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5741 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5742 vect[1], shift4_mask);
5743 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5744 (*result_chain)[0] = data_ref;
5745 return true;
5747 return false;
5750 /* Function vect_transform_grouped_load.
5752 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5753 to perform their permutation and ascribe the resulting vectorized statements to
5754 the scalar statements.
5757 void
5758 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5759 gimple_stmt_iterator *gsi)
5761 machine_mode mode;
5762 vec<tree> result_chain = vNULL;
5764 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5765 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
5766 vectors that are ready for vector computation. */
5767 result_chain.create (size);
5769 /* If the reassociation width for the vector type is 2 or greater, the target
5770 machine can execute 2 or more vector instructions in parallel. Otherwise
5771 try to get the chain for the load group using vect_shift_permute_load_chain. */
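/* Note that with the condition below the shift-based variant is only
attempted when SIZE is not a power of two and the reassociation width
reported for VEC_PERM_EXPR is 1; if it cannot handle the group it
returns false and we fall back to vect_permute_load_chain. */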
5772 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5773 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5774 || exact_log2 (size) != -1
5775 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5776 gsi, &result_chain))
5777 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5778 vect_record_grouped_load_vectors (stmt, result_chain);
5779 result_chain.release ();
5782 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5783 generated as part of the vectorization of STMT. Assign the statement
5784 for each vector to the associated scalar statement. */
5786 void
5787 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5789 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5790 gimple *next_stmt, *new_stmt;
5791 unsigned int i, gap_count;
5792 tree tmp_data_ref;
5794 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5795 Since we scan the chain starting from its first node, their order
5796 corresponds to the order of data-refs in RESULT_CHAIN. */
5797 next_stmt = first_stmt;
5798 gap_count = 1;
5799 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5801 if (!next_stmt)
5802 break;
5804 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5805 code elimination pass later. No need to check for the first stmt in
5806 the group, since it always exists.
5807 GROUP_GAP is the number of steps in elements from the previous
5808 access (if there is no gap GROUP_GAP is 1). We skip loads that
5809 correspond to the gaps. */
5810 if (next_stmt != first_stmt
5811 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5813 gap_count++;
5814 continue;
5817 while (next_stmt)
5819 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5820 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5821 copies, and we put the new vector statement in the first available
5822 RELATED_STMT. */
5823 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5824 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5825 else
5827 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5829 gimple *prev_stmt =
5830 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5831 gimple *rel_stmt =
5832 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5833 while (rel_stmt)
5835 prev_stmt = rel_stmt;
5836 rel_stmt =
5837 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5840 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5841 new_stmt;
5845 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5846 gap_count = 1;
5847 /* If NEXT_STMT accesses the same DR as the previous statement,
5848 put the same TMP_DATA_REF as its vectorized statement; otherwise
5849 get the next data-ref from RESULT_CHAIN. */
5850 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5851 break;
5856 /* Function vect_can_force_dr_alignment_p.
5858 Returns whether the alignment of a DECL can be forced to be aligned
5859 on an ALIGNMENT-bit boundary. */
5861 bool
5862 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5864 if (TREE_CODE (decl) != VAR_DECL)
5865 return false;
5867 if (decl_in_symtab_p (decl)
5868 && !symtab_node::get (decl)->can_increase_alignment_p ())
5869 return false;
5871 if (TREE_STATIC (decl))
5872 return (alignment <= MAX_OFILE_ALIGNMENT);
5873 else
5874 return (alignment <= MAX_STACK_ALIGNMENT);
5878 /* Return whether the data reference DR is supported with respect to its
5879 alignment.
5880 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5881 if it is aligned, i.e., check if it is possible to vectorize it with a
5882 different alignment. */
5884 enum dr_alignment_support
5885 vect_supportable_dr_alignment (struct data_reference *dr,
5886 bool check_aligned_accesses)
5888 gimple *stmt = DR_STMT (dr);
5889 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5890 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5891 machine_mode mode = TYPE_MODE (vectype);
5892 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5893 struct loop *vect_loop = NULL;
5894 bool nested_in_vect_loop = false;
5896 if (aligned_access_p (dr) && !check_aligned_accesses)
5897 return dr_aligned;
5899 /* For now assume all conditional loads/stores support unaligned
5900 access without any special code. */
5901 if (is_gimple_call (stmt)
5902 && gimple_call_internal_p (stmt)
5903 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5904 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5905 return dr_unaligned_supported;
5907 if (loop_vinfo)
5909 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5910 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5913 /* Possibly unaligned access. */
5915 /* We can choose between using the implicit realignment scheme (generating
5916 a misaligned_move stmt) and the explicit realignment scheme (generating
5917 aligned loads with a REALIGN_LOAD). There are two variants to the
5918 explicit realignment scheme: optimized, and unoptimized.
5919 We can optimize the realignment only if the step between consecutive
5920 vector loads is equal to the vector size. Since the vector memory
5921 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5922 is guaranteed that the misalignment amount remains the same throughout the
5923 execution of the vectorized loop. Therefore, we can create the
5924 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5925 at the loop preheader.
5927 However, in the case of outer-loop vectorization, when vectorizing a
5928 memory access in the inner-loop nested within the LOOP that is now being
5929 vectorized, while it is guaranteed that the misalignment of the
5930 vectorized memory access will remain the same in different outer-loop
5931 iterations, it is *not* guaranteed that it will remain the same throughout
5932 the execution of the inner-loop. This is because the inner-loop advances
5933 with the original scalar step (and not in steps of VS). If the inner-loop
5934 step happens to be a multiple of VS, then the misalignment remains fixed
5935 and we can use the optimized realignment scheme. For example:
5937 for (i=0; i<N; i++)
5938 for (j=0; j<M; j++)
5939 s += a[i+j];
5941 When vectorizing the i-loop in the above example, the step between
5942 consecutive vector loads is 1, and so the misalignment does not remain
5943 fixed across the execution of the inner-loop, and the realignment cannot
5944 be optimized (as illustrated in the following pseudo vectorized loop):
5946 for (i=0; i<N; i+=4)
5947 for (j=0; j<M; j++){
5948 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5949 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5950 // (assuming that we start from an aligned address).
5953 We therefore have to use the unoptimized realignment scheme:
5955 for (i=0; i<N; i+=4)
5956 for (j=k; j<M; j+=4)
5957 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5958 // that the misalignment of the initial address is
5959 // 0).
5961 The loop can then be vectorized as follows:
5963 for (k=0; k<4; k++){
5964 rt = get_realignment_token (&vp[k]);
5965 for (i=0; i<N; i+=4){
5966 v1 = vp[i+k];
5967 for (j=k; j<M; j+=4){
5968 v2 = vp[i+j+VS-1];
5969 va = REALIGN_LOAD <v1,v2,rt>;
5970 vs += va;
5971 v1 = v2;
5974 } */
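/* For reads the code below prefers the explicit realignment scheme when
the target provides vec_realign_load and, if it defines the
builtin_mask_for_load hook, that hook returns a builtin; the optimized
variant is chosen only when we are vectorizing a loop and, for accesses
nested in an inner loop, the inner-loop step equals the vector size, as
discussed above. */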
5976 if (DR_IS_READ (dr))
5978 bool is_packed = false;
5979 tree type = (TREE_TYPE (DR_REF (dr)));
5981 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5982 && (!targetm.vectorize.builtin_mask_for_load
5983 || targetm.vectorize.builtin_mask_for_load ()))
5985 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5986 if ((nested_in_vect_loop
5987 && (TREE_INT_CST_LOW (DR_STEP (dr))
5988 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5989 || !loop_vinfo)
5990 return dr_explicit_realign;
5991 else
5992 return dr_explicit_realign_optimized;
5994 if (!known_alignment_for_access_p (dr))
5995 is_packed = not_size_aligned (DR_REF (dr));
5997 if ((TYPE_USER_ALIGN (type) && !is_packed)
5998 || targetm.vectorize.
5999 support_vector_misalignment (mode, type,
6000 DR_MISALIGNMENT (dr), is_packed))
6001 /* Can't software pipeline the loads, but can at least do them. */
6002 return dr_unaligned_supported;
6004 else
6006 bool is_packed = false;
6007 tree type = (TREE_TYPE (DR_REF (dr)));
6009 if (!known_alignment_for_access_p (dr))
6010 is_packed = not_size_aligned (DR_REF (dr));
6012 if ((TYPE_USER_ALIGN (type) && !is_packed)
6013 || targetm.vectorize.
6014 support_vector_misalignment (mode, type,
6015 DR_MISALIGNMENT (dr), is_packed))
6016 return dr_unaligned_supported;
6019 /* Unsupported. */
6020 return dr_unaligned_unsupported;