gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "tm_p.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "cgraph.h"
35 #include "dumpfile.h"
36 #include "alias.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "tree-eh.h"
40 #include "gimplify.h"
41 #include "gimple-iterator.h"
42 #include "gimplify-me.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "tree-ssa-loop-manip.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "expr.h"
50 #include "builtins.h"
51 #include "params.h"
53 /* Return true if load- or store-lanes optab OPTAB is implemented for
54 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
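/* For example: for VECTYPE V4SI and COUNT 2 this looks for a 256-bit
   integer array mode and then checks that the target implements OPTAB
   (say, vec_load_lanes) for converting between that array mode and
   V4SImode.  */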
56 static bool
57 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
58 tree vectype, unsigned HOST_WIDE_INT count)
60 machine_mode mode, array_mode;
61 bool limit_p;
63 mode = TYPE_MODE (vectype);
64 limit_p = !targetm.array_mode_supported_p (mode, count);
65 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
66 MODE_INT, limit_p);
68 if (array_mode == BLKmode)
70 if (dump_enabled_p ())
71 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
72 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
73 GET_MODE_NAME (mode), count);
74 return false;
77 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
79 if (dump_enabled_p ())
80 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
81 "cannot use %s<%s><%s>\n", name,
82 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
83 return false;
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_NOTE, vect_location,
88 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
89 GET_MODE_NAME (mode));
91 return true;
95 /* Return the smallest scalar part of STMT.
96 This is used to determine the vectype of the stmt. We generally set the
97 vectype according to the type of the result (lhs). For stmts whose
98    result-type is different from the type of the arguments (e.g., demotion,
99 promotion), vectype will be reset appropriately (later). Note that we have
100 to visit the smallest datatype in this function, because that determines the
101 VF. If the smallest datatype in the loop is present only as the rhs of a
102 promotion operation - we'd miss it.
103 Such a case, where a variable of this datatype does not appear in the lhs
104 anywhere in the loop, can only occur if it's an invariant: e.g.:
105 'int_x = (int) short_inv', which we'd expect to have been optimized away by
106 invariant motion. However, we cannot rely on invariant motion to always
107 take invariants out of the loop, and so in the case of promotion we also
108 have to check the rhs.
109 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
110 types. */
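/* For example: in 'int_x = (int) short_y' the result (lhs) type is int,
   4 bytes, but the smallest scalar type involved is short, 2 bytes, and
   that is what must determine the vectorization factor.  */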
112 tree
113 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
114 HOST_WIDE_INT *rhs_size_unit)
116 tree scalar_type = gimple_expr_type (stmt);
117 HOST_WIDE_INT lhs, rhs;
119 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
121 if (is_gimple_assign (stmt)
122 && (gimple_assign_cast_p (stmt)
123 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
124 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
125 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
127 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
129 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
130 if (rhs < lhs)
131 scalar_type = rhs_type;
134 *lhs_size_unit = lhs;
135 *rhs_size_unit = rhs;
136 return scalar_type;
140 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
141 tested at run-time. Return TRUE if DDR was successfully inserted.
142 Return false if versioning is not supported. */
144 static bool
145 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
147 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
149 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
150 return false;
152 if (dump_enabled_p ())
154 dump_printf_loc (MSG_NOTE, vect_location,
155 "mark for run-time aliasing test between ");
156 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
157 dump_printf (MSG_NOTE, " and ");
158 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
159 dump_printf (MSG_NOTE, "\n");
162 if (optimize_loop_nest_for_size_p (loop))
164 if (dump_enabled_p ())
165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
166 "versioning not supported when optimizing"
167 " for size.\n");
168 return false;
171 /* FORNOW: We don't support versioning with outer-loop vectorization. */
172 if (loop->inner)
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
176 "versioning not yet supported for outer-loops.\n");
177 return false;
180 /* FORNOW: We don't support creating runtime alias tests for non-constant
181 step. */
182 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
183 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
185 if (dump_enabled_p ())
186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
187 "versioning not yet supported for non-constant "
188 "step\n");
189 return false;
192 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
193 return true;
197 /* Function vect_analyze_data_ref_dependence.
199 Return TRUE if there (might) exist a dependence between a memory-reference
200    DRA and a memory-reference DRB.  When versioning for alias can handle the
201    dependence with a run-time check, return FALSE.  Adjust *MAX_VF according to
202 the data dependence. */
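/* For example: a known dependence distance of 3 with an incoming *MAX_VF
   of 8 lowers *MAX_VF to 3, after which the distance no longer
   constitutes a dependence as far as vectorization is concerned.  */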
204 static bool
205 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
206 loop_vec_info loop_vinfo, int *max_vf)
208 unsigned int i;
209 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
210 struct data_reference *dra = DDR_A (ddr);
211 struct data_reference *drb = DDR_B (ddr);
212 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
213 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
214 lambda_vector dist_v;
215 unsigned int loop_depth;
217 /* In loop analysis all data references should be vectorizable. */
218 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
219 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
220 gcc_unreachable ();
222 /* Independent data accesses. */
223 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
224 return false;
226 if (dra == drb
227 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
228 return false;
230 /* Even if we have an anti-dependence then, as the vectorized loop covers at
231 least two scalar iterations, there is always also a true dependence.
232 As the vectorizer does not re-order loads and stores we can ignore
233 the anti-dependence if TBAA can disambiguate both DRs similar to the
234 case with known negative distance anti-dependences (positive
235 distance anti-dependences would violate TBAA constraints). */
236 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
237 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
238 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
239 get_alias_set (DR_REF (drb))))
240 return false;
242 /* Unknown data dependence. */
243 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
245 /* If user asserted safelen consecutive iterations can be
246 executed concurrently, assume independence. */
247 if (loop->safelen >= 2)
249 if (loop->safelen < *max_vf)
250 *max_vf = loop->safelen;
251 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
252 return false;
255 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
256 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
258 if (dump_enabled_p ())
260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
261 "versioning for alias not supported for: "
262 "can't determine dependence between ");
263 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
264 DR_REF (dra));
265 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
266 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
267 DR_REF (drb));
268 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
270 return true;
273 if (dump_enabled_p ())
275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
276 "versioning for alias required: "
277 "can't determine dependence between ");
278 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
279 DR_REF (dra));
280 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
281 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
282 DR_REF (drb));
283 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
286 /* Add to list of ddrs that need to be tested at run-time. */
287 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
290 /* Known data dependence. */
291 if (DDR_NUM_DIST_VECTS (ddr) == 0)
293 /* If user asserted safelen consecutive iterations can be
294 executed concurrently, assume independence. */
295 if (loop->safelen >= 2)
297 if (loop->safelen < *max_vf)
298 *max_vf = loop->safelen;
299 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
300 return false;
303 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
304 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
306 if (dump_enabled_p ())
308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
309 "versioning for alias not supported for: "
310 "bad dist vector for ");
311 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
312 DR_REF (dra));
313 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
314 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
315 DR_REF (drb));
316 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
318 return true;
321 if (dump_enabled_p ())
323 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
324 "versioning for alias required: "
325 "bad dist vector for ");
326 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
327 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
328 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
329 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
331 /* Add to list of ddrs that need to be tested at run-time. */
332 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
335 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
336 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
338 int dist = dist_v[loop_depth];
340 if (dump_enabled_p ())
341 dump_printf_loc (MSG_NOTE, vect_location,
342 "dependence distance = %d.\n", dist);
344 if (dist == 0)
346 if (dump_enabled_p ())
348 dump_printf_loc (MSG_NOTE, vect_location,
349 "dependence distance == 0 between ");
350 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
351 dump_printf (MSG_NOTE, " and ");
352 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
353               dump_printf (MSG_NOTE, "\n");
356 /* When we perform grouped accesses and perform implicit CSE
357 by detecting equal accesses and doing disambiguation with
358 runtime alias tests like for
359 .. = a[i];
360 .. = a[i+1];
361 a[i] = ..;
362 a[i+1] = ..;
363 *p = ..;
364 .. = a[i];
365 .. = a[i+1];
366 where we will end up loading { a[i], a[i+1] } once, make
367 sure that inserting group loads before the first load and
368 stores after the last store will do the right thing.
369 Similar for groups like
370 a[i] = ...;
371 ... = a[i];
372 a[i+1] = ...;
373 where loads from the group interleave with the store. */
374 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
375 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
377 gimple *earlier_stmt;
378 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
379 if (DR_IS_WRITE
380 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
382 if (dump_enabled_p ())
383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
384 "READ_WRITE dependence in interleaving."
385 "\n");
386 return true;
390 continue;
393 if (dist > 0 && DDR_REVERSED_P (ddr))
395 /* If DDR_REVERSED_P the order of the data-refs in DDR was
396 reversed (to make distance vector positive), and the actual
397 distance is negative. */
398 if (dump_enabled_p ())
399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
400 "dependence distance negative.\n");
401 /* Record a negative dependence distance to later limit the
402 amount of stmt copying / unrolling we can perform.
403 Only need to handle read-after-write dependence. */
404 if (DR_IS_READ (drb)
405 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
406 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
407 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
408 continue;
411 if (abs (dist) >= 2
412 && abs (dist) < *max_vf)
414 /* The dependence distance requires reduction of the maximal
415 vectorization factor. */
416 *max_vf = abs (dist);
417 if (dump_enabled_p ())
418 dump_printf_loc (MSG_NOTE, vect_location,
419 "adjusting maximal vectorization factor to %i\n",
420 *max_vf);
423 if (abs (dist) >= *max_vf)
425 /* Dependence distance does not create dependence, as far as
426 vectorization is concerned, in this case. */
427 if (dump_enabled_p ())
428 dump_printf_loc (MSG_NOTE, vect_location,
429 "dependence distance >= VF.\n");
430 continue;
433 if (dump_enabled_p ())
435 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
436 "not vectorized, possible dependence "
437 "between data-refs ");
438       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
439       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
440       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
441       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
444 return true;
447 return false;
450 /* Function vect_analyze_data_ref_dependences.
452 Examine all the data references in the loop, and make sure there do not
453 exist any data dependences between them. Set *MAX_VF according to
454 the maximum vectorization factor the data dependences allow. */
456 bool
457 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
459 unsigned int i;
460 struct data_dependence_relation *ddr;
462 if (dump_enabled_p ())
463 dump_printf_loc (MSG_NOTE, vect_location,
464 "=== vect_analyze_data_ref_dependences ===\n");
466 LOOP_VINFO_DDRS (loop_vinfo)
467 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
468 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
469 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
470 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
471 &LOOP_VINFO_DDRS (loop_vinfo),
472 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
473 return false;
475 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
476 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
477 return false;
479 return true;
483 /* Function vect_slp_analyze_data_ref_dependence.
485 Return TRUE if there (might) exist a dependence between a memory-reference
486    DRA and a memory-reference DRB that would prevent SLP vectorization.
487    Return FALSE otherwise.  */
490 static bool
491 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
493 struct data_reference *dra = DDR_A (ddr);
494 struct data_reference *drb = DDR_B (ddr);
496 /* We need to check dependences of statements marked as unvectorizable
497 as well, they still can prohibit vectorization. */
499 /* Independent data accesses. */
500 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
501 return false;
503 if (dra == drb)
504 return false;
506 /* Read-read is OK. */
507 if (DR_IS_READ (dra) && DR_IS_READ (drb))
508 return false;
510 /* If dra and drb are part of the same interleaving chain consider
511 them independent. */
512 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
513 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
514 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
515 return false;
517 /* Unknown data dependence. */
518 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
520 if (dump_enabled_p ())
522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
523 "can't determine dependence between ");
524 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
525 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
526 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
527 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
530 else if (dump_enabled_p ())
532 dump_printf_loc (MSG_NOTE, vect_location,
533 "determined dependence between ");
534 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
535 dump_printf (MSG_NOTE, " and ");
536 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
537 dump_printf (MSG_NOTE, "\n");
540 return true;
544 /* Analyze dependences involved in the transform of SLP NODE. STORES
545 contain the vector of scalar stores of this instance if we are
546 disambiguating the loads. */
548 static bool
549 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
550 vec<gimple *> stores, gimple *last_store)
552 /* This walks over all stmts involved in the SLP load/store done
553 in NODE verifying we can sink them up to the last stmt in the
554 group. */
555 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
556 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
558 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
559 if (access == last_access)
560 continue;
561 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
562 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
563 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
565 gimple *stmt = gsi_stmt (gsi);
566 if (! gimple_vuse (stmt)
567 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
568 continue;
570 /* If we couldn't record a (single) data reference for this
571 stmt we have to give up. */
572 /* ??? Here and below if dependence analysis fails we can resort
573 to the alias oracle which can handle more kinds of stmts. */
574 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
575 if (!dr_b)
576 return false;
578 /* If we run into a store of this same instance (we've just
579 marked those) then delay dependence checking until we run
580 into the last store because this is where it will have
581 been sunk to (and we verify if we can do that as well). */
582 if (gimple_visited_p (stmt))
584 if (stmt != last_store)
585 continue;
586 unsigned i;
587 gimple *store;
588 FOR_EACH_VEC_ELT (stores, i, store)
590 data_reference *store_dr
591 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
592 ddr_p ddr = initialize_data_dependence_relation
593 (dr_a, store_dr, vNULL);
594 if (vect_slp_analyze_data_ref_dependence (ddr))
596 free_dependence_relation (ddr);
597 return false;
599 free_dependence_relation (ddr);
603 ddr_p ddr = initialize_data_dependence_relation (dr_a, dr_b, vNULL);
604 if (vect_slp_analyze_data_ref_dependence (ddr))
606 free_dependence_relation (ddr);
607 return false;
609 free_dependence_relation (ddr);
612 return true;
616 /* Function vect_slp_analyze_instance_dependence.
618    Examine all the data references in the SLP instance, and make sure there
619    do not exist any data dependences between them that would prevent
620    vectorizing the instance.  */
622 bool
623 vect_slp_analyze_instance_dependence (slp_instance instance)
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE, vect_location,
627 "=== vect_slp_analyze_instance_dependence ===\n");
629 /* The stores of this instance are at the root of the SLP tree. */
630 slp_tree store = SLP_INSTANCE_TREE (instance);
631 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
632 store = NULL;
634 /* Verify we can sink stores to the vectorized stmt insert location. */
635 gimple *last_store = NULL;
636 if (store)
638 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
639 return false;
641 /* Mark stores in this instance and remember the last one. */
642 last_store = vect_find_last_scalar_stmt_in_slp (store);
643 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
644 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
647 bool res = true;
649 /* Verify we can sink loads to the vectorized stmt insert location,
650 special-casing stores of this instance. */
651 slp_tree load;
652 unsigned int i;
653 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
654 if (! vect_slp_analyze_node_dependences (instance, load,
655 store
656 ? SLP_TREE_SCALAR_STMTS (store)
657 : vNULL, last_store))
659 res = false;
660 break;
663 /* Unset the visited flag. */
664 if (store)
665 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
666 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
668 return res;
671 /* Function vect_compute_data_ref_alignment
673 Compute the misalignment of the data reference DR.
675 Output:
676 1. If during the misalignment computation it is found that the data reference
677 cannot be vectorized then false is returned.
678 2. DR_MISALIGNMENT (DR) is defined.
680 FOR NOW: No analysis is actually performed. Misalignment is calculated
681 only for trivial cases. TODO. */
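/* For example: for a V4SI vectype (16-byte vector alignment), a base
   known to be 16-byte aligned and DR_INIT 4, DR_MISALIGNMENT is set
   to 4 bytes.  */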
683 bool
684 vect_compute_data_ref_alignment (struct data_reference *dr)
686 gimple *stmt = DR_STMT (dr);
687 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
688 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
689 struct loop *loop = NULL;
690 tree ref = DR_REF (dr);
691 tree vectype;
692 tree base, base_addr;
693 tree misalign = NULL_TREE;
694 tree aligned_to;
695 unsigned HOST_WIDE_INT alignment;
697 if (dump_enabled_p ())
698 dump_printf_loc (MSG_NOTE, vect_location,
699 "vect_compute_data_ref_alignment:\n");
701 if (loop_vinfo)
702 loop = LOOP_VINFO_LOOP (loop_vinfo);
704 /* Initialize misalignment to unknown. */
705 SET_DR_MISALIGNMENT (dr, -1);
707 if (tree_fits_shwi_p (DR_STEP (dr)))
708 misalign = DR_INIT (dr);
709 aligned_to = DR_ALIGNED_TO (dr);
710 base_addr = DR_BASE_ADDRESS (dr);
711 vectype = STMT_VINFO_VECTYPE (stmt_info);
713 /* In case the dataref is in an inner-loop of the loop that is being
714 vectorized (LOOP), we use the base and misalignment information
715 relative to the outer-loop (LOOP). This is ok only if the misalignment
716 stays the same throughout the execution of the inner-loop, which is why
717      we have to check that the stride of the dataref in the inner-loop is a
718      multiple of the vector size.  */
719 if (loop && nested_in_vect_loop_p (loop, stmt))
721 tree step = DR_STEP (dr);
723 if (tree_fits_shwi_p (step)
724 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
726 if (dump_enabled_p ())
727 dump_printf_loc (MSG_NOTE, vect_location,
728 "inner step divides the vector-size.\n");
729 misalign = STMT_VINFO_DR_INIT (stmt_info);
730 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
731 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
733 else
735 if (dump_enabled_p ())
736 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
737 "inner step doesn't divide the vector-size.\n");
738 misalign = NULL_TREE;
742 /* Similarly we can only use base and misalignment information relative to
743 an innermost loop if the misalignment stays the same throughout the
744      execution of the loop.  As above, this is the case if the stride of
745      the dataref times the vectorization factor is a multiple of the vector size.  */
746 else
748 tree step = DR_STEP (dr);
749 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
751 if (tree_fits_shwi_p (step)
752 && ((tree_to_shwi (step) * vf)
753 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
755 if (dump_enabled_p ())
756 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
757 "step doesn't divide the vector-size.\n");
758 misalign = NULL_TREE;
762 /* To look at alignment of the base we have to preserve an inner MEM_REF
763 as that carries alignment information of the actual access. */
764 base = ref;
765 while (handled_component_p (base))
766 base = TREE_OPERAND (base, 0);
767 if (TREE_CODE (base) == MEM_REF)
768 base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
769 build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
770 unsigned int base_alignment = get_object_alignment (base);
772 if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
773 DR_VECT_AUX (dr)->base_element_aligned = true;
775 alignment = TYPE_ALIGN_UNIT (vectype);
777 if ((compare_tree_int (aligned_to, alignment) < 0)
778 || !misalign)
780 if (dump_enabled_p ())
782 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
783 "Unknown alignment for access: ");
784 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
785 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
787 return true;
790 if (base_alignment < TYPE_ALIGN (vectype))
792 /* Strip an inner MEM_REF to a bare decl if possible. */
793 if (TREE_CODE (base) == MEM_REF
794 && integer_zerop (TREE_OPERAND (base, 1))
795 && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
796 base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
798 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
800 if (dump_enabled_p ())
802 dump_printf_loc (MSG_NOTE, vect_location,
803 "can't force alignment of ref: ");
804 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
805 dump_printf (MSG_NOTE, "\n");
807 return true;
810 /* Force the alignment of the decl.
811 NOTE: This is the only change to the code we make during
812 the analysis phase, before deciding to vectorize the loop. */
813 if (dump_enabled_p ())
815 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
816 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
817 dump_printf (MSG_NOTE, "\n");
820 DR_VECT_AUX (dr)->base_decl = base;
821 DR_VECT_AUX (dr)->base_misaligned = true;
822 DR_VECT_AUX (dr)->base_element_aligned = true;
825   /* If this is a backward running DR then the first access in the larger
826 vectype actually is N-1 elements before the address in the DR.
827 Adjust misalign accordingly. */
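  /* For example: a V4SI access with DR_STEP -4 gives
     OFFSET = 3 * -4 = -12, so 12 bytes are subtracted from MISALIGN
     before it is reduced modulo the vector alignment below.  */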
828 if (tree_int_cst_sgn (DR_STEP (dr)) < 0)
830 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
831 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
832 otherwise we wouldn't be here. */
833 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
834 /* PLUS because DR_STEP was negative. */
835 misalign = size_binop (PLUS_EXPR, misalign, offset);
838 SET_DR_MISALIGNMENT (dr,
839 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
841 if (dump_enabled_p ())
843 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
844 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
845 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
846 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
849 return true;
853 /* Function vect_update_misalignment_for_peel
855 DR - the data reference whose misalignment is to be adjusted.
856 DR_PEEL - the data reference whose misalignment is being made
857 zero in the vector loop by the peel.
858 NPEEL - the number of iterations in the peel loop if the misalignment
859 of DR_PEEL is known at compile time. */
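/* For example: a forward DR with 4-byte elements that was misaligned by
   4 bytes against a 16-byte vector becomes aligned after peeling
   NPEEL = 3 iterations, since (4 + 3 * 4) & 15 == 0.  */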
861 static void
862 vect_update_misalignment_for_peel (struct data_reference *dr,
863 struct data_reference *dr_peel, int npeel)
865 unsigned int i;
866 vec<dr_p> same_align_drs;
867 struct data_reference *current_dr;
868 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
869 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
870 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
871 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
873 /* For interleaved data accesses the step in the loop must be multiplied by
874 the size of the interleaving group. */
875 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
876 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
877 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
878 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
880 /* It can be assumed that the data refs with the same alignment as dr_peel
881 are aligned in the vector loop. */
882 same_align_drs
883 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
884 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
886 if (current_dr != dr)
887 continue;
888 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
889 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
890 SET_DR_MISALIGNMENT (dr, 0);
891 return;
894 if (known_alignment_for_access_p (dr)
895 && known_alignment_for_access_p (dr_peel))
897 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
898 int misal = DR_MISALIGNMENT (dr);
899 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
900 misal += negative ? -npeel * dr_size : npeel * dr_size;
901 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
902 SET_DR_MISALIGNMENT (dr, misal);
903 return;
906 if (dump_enabled_p ())
907 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
908 SET_DR_MISALIGNMENT (dr, -1);
912 /* Function verify_data_ref_alignment
914 Return TRUE if DR can be handled with respect to alignment. */
916 static bool
917 verify_data_ref_alignment (data_reference_p dr)
919 enum dr_alignment_support supportable_dr_alignment
920 = vect_supportable_dr_alignment (dr, false);
921 if (!supportable_dr_alignment)
923 if (dump_enabled_p ())
925 if (DR_IS_READ (dr))
926 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
927 "not vectorized: unsupported unaligned load.");
928 else
929 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
930 "not vectorized: unsupported unaligned "
931 "store.");
933 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
934 DR_REF (dr));
935 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
937 return false;
940 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
941 dump_printf_loc (MSG_NOTE, vect_location,
942 "Vectorizing an unaligned access.\n");
944 return true;
947 /* Function vect_verify_datarefs_alignment
949 Return TRUE if all data references in the loop can be
950 handled with respect to alignment. */
952 bool
953 vect_verify_datarefs_alignment (loop_vec_info vinfo)
955 vec<data_reference_p> datarefs = vinfo->datarefs;
956 struct data_reference *dr;
957 unsigned int i;
959 FOR_EACH_VEC_ELT (datarefs, i, dr)
961 gimple *stmt = DR_STMT (dr);
962 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
964 if (!STMT_VINFO_RELEVANT_P (stmt_info))
965 continue;
967 /* For interleaving, only the alignment of the first access matters. */
968 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
969 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
970         continue;
972 /* Strided accesses perform only component accesses, alignment is
973 irrelevant for them. */
974 if (STMT_VINFO_STRIDED_P (stmt_info)
975 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
976         continue;
978 if (! verify_data_ref_alignment (dr))
979 return false;
982 return true;
985 /* Given a memory reference EXP return whether its alignment is less
986 than its size. */
988 static bool
989 not_size_aligned (tree exp)
991 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
992 return true;
994 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
995 > get_object_alignment (exp));
998 /* Function vector_alignment_reachable_p
1000 Return true if vector alignment for DR is reachable by peeling
1001 a few loop iterations. Return false otherwise. */
1003 static bool
1004 vector_alignment_reachable_p (struct data_reference *dr)
1006 gimple *stmt = DR_STMT (dr);
1007 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1008 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1010 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1012 /* For interleaved access we peel only if number of iterations in
1013          the prolog loop ({VF - misalignment}) is a multiple of the
1014 number of the interleaved accesses. */
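      /* For example: with 8 elements per vector, a misalignment of 2
         elements and GROUP_SIZE 2, the 6 prologue elements correspond to
         3 whole group iterations, so peeling can reach alignment.  */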
1015 int elem_size, mis_in_elements;
1016 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1018 /* FORNOW: handle only known alignment. */
1019 if (!known_alignment_for_access_p (dr))
1020 return false;
1022 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1023 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1025 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1026 return false;
1029   /* If misalignment is known at compile time then allow peeling
1030 only if natural alignment is reachable through peeling. */
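  /* For example: a misalignment of 6 bytes with 4-byte elements cannot
     be removed by peeling whole scalar iterations, since 6 % 4 != 0.  */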
1031 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1033 HOST_WIDE_INT elmsize =
1034 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1035 if (dump_enabled_p ())
1037 dump_printf_loc (MSG_NOTE, vect_location,
1038 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1039 dump_printf (MSG_NOTE,
1040 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1042 if (DR_MISALIGNMENT (dr) % elmsize)
1044 if (dump_enabled_p ())
1045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1046 "data size does not divide the misalignment.\n");
1047 return false;
1051 if (!known_alignment_for_access_p (dr))
1053 tree type = TREE_TYPE (DR_REF (dr));
1054 bool is_packed = not_size_aligned (DR_REF (dr));
1055 if (dump_enabled_p ())
1056 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1057 "Unknown misalignment, is_packed = %d\n",is_packed);
1058 if ((TYPE_USER_ALIGN (type) && !is_packed)
1059 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1060 return true;
1061 else
1062 return false;
1065 return true;
1069 /* Calculate the cost of the memory access represented by DR. */
1071 static void
1072 vect_get_data_access_cost (struct data_reference *dr,
1073 unsigned int *inside_cost,
1074 unsigned int *outside_cost,
1075 stmt_vector_for_cost *body_cost_vec)
1077 gimple *stmt = DR_STMT (dr);
1078 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1079 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1080 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1081 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1082 int ncopies = vf / nunits;
1084 if (DR_IS_READ (dr))
1085 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1086 NULL, body_cost_vec, false);
1087 else
1088 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1090 if (dump_enabled_p ())
1091 dump_printf_loc (MSG_NOTE, vect_location,
1092 "vect_get_data_access_cost: inside_cost = %d, "
1093 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1097 typedef struct _vect_peel_info
1099 int npeel;
1100 struct data_reference *dr;
1101 unsigned int count;
1102 } *vect_peel_info;
1104 typedef struct _vect_peel_extended_info
1106 struct _vect_peel_info peel_info;
1107 unsigned int inside_cost;
1108 unsigned int outside_cost;
1109 stmt_vector_for_cost body_cost_vec;
1110 } *vect_peel_extended_info;
1113 /* Peeling hashtable helpers. */
1115 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1117 static inline hashval_t hash (const _vect_peel_info *);
1118 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1121 inline hashval_t
1122 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1124 return (hashval_t) peel_info->npeel;
1127 inline bool
1128 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1130 return (a->npeel == b->npeel);
1134 /* Insert DR into peeling hash table with NPEEL as key. */
1136 static void
1137 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1138 loop_vec_info loop_vinfo, struct data_reference *dr,
1139 int npeel)
1141 struct _vect_peel_info elem, *slot;
1142 _vect_peel_info **new_slot;
1143 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1145 elem.npeel = npeel;
1146 slot = peeling_htab->find (&elem);
1147 if (slot)
1148 slot->count++;
1149 else
1151 slot = XNEW (struct _vect_peel_info);
1152 slot->npeel = npeel;
1153 slot->dr = dr;
1154 slot->count = 1;
1155 new_slot = peeling_htab->find_slot (slot, INSERT);
1156 *new_slot = slot;
1159 if (!supportable_dr_alignment
1160 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1161 slot->count += VECT_MAX_COST;
1165 /* Traverse the peeling hash table to find the peeling option that aligns the
1166    maximum number of data accesses.  */
1168 int
1169 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1170 _vect_peel_extended_info *max)
1172 vect_peel_info elem = *slot;
1174 if (elem->count > max->peel_info.count
1175 || (elem->count == max->peel_info.count
1176 && max->peel_info.npeel > elem->npeel))
1178 max->peel_info.npeel = elem->npeel;
1179 max->peel_info.count = elem->count;
1180 max->peel_info.dr = elem->dr;
1183 return 1;
1187 /* Traverse peeling hash table and calculate cost for each peeling option.
1188 Find the one with the lowest cost. */
1190 int
1191 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1192 _vect_peel_extended_info *min)
1194 vect_peel_info elem = *slot;
1195 int save_misalignment, dummy;
1196 unsigned int inside_cost = 0, outside_cost = 0, i;
1197 gimple *stmt = DR_STMT (elem->dr);
1198 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1199 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1200 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1201 struct data_reference *dr;
1202 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1204 prologue_cost_vec.create (2);
1205 body_cost_vec.create (2);
1206 epilogue_cost_vec.create (2);
1208 FOR_EACH_VEC_ELT (datarefs, i, dr)
1210 stmt = DR_STMT (dr);
1211 stmt_info = vinfo_for_stmt (stmt);
1212 /* For interleaving, only the alignment of the first access
1213 matters. */
1214 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1215 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1216 continue;
1218 save_misalignment = DR_MISALIGNMENT (dr);
1219 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1220 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1221 &body_cost_vec);
1222 SET_DR_MISALIGNMENT (dr, save_misalignment);
1225 outside_cost += vect_get_known_peeling_cost
1226 (loop_vinfo, elem->npeel, &dummy,
1227 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1228 &prologue_cost_vec, &epilogue_cost_vec);
1230 /* Prologue and epilogue costs are added to the target model later.
1231 These costs depend only on the scalar iteration cost, the
1232 number of peeling iterations finally chosen, and the number of
1233 misaligned statements. So discard the information found here. */
1234 prologue_cost_vec.release ();
1235 epilogue_cost_vec.release ();
1237 if (inside_cost < min->inside_cost
1238 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1240 min->inside_cost = inside_cost;
1241 min->outside_cost = outside_cost;
1242 min->body_cost_vec.release ();
1243 min->body_cost_vec = body_cost_vec;
1244 min->peel_info.dr = elem->dr;
1245 min->peel_info.npeel = elem->npeel;
1247 else
1248 body_cost_vec.release ();
1250 return 1;
1254 /* Choose best peeling option by traversing peeling hash table and either
1255 choosing an option with the lowest cost (if cost model is enabled) or the
1256 option that aligns as many accesses as possible. */
1258 static struct data_reference *
1259 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1260 loop_vec_info loop_vinfo,
1261 unsigned int *npeel,
1262 stmt_vector_for_cost *body_cost_vec)
1264 struct _vect_peel_extended_info res;
1266 res.peel_info.dr = NULL;
1267 res.body_cost_vec = stmt_vector_for_cost ();
1269 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1271 res.inside_cost = INT_MAX;
1272 res.outside_cost = INT_MAX;
1273 peeling_htab->traverse <_vect_peel_extended_info *,
1274 vect_peeling_hash_get_lowest_cost> (&res);
1276 else
1278 res.peel_info.count = 0;
1279 peeling_htab->traverse <_vect_peel_extended_info *,
1280 vect_peeling_hash_get_most_frequent> (&res);
1283 *npeel = res.peel_info.npeel;
1284 *body_cost_vec = res.body_cost_vec;
1285 return res.peel_info.dr;
1289 /* Function vect_enhance_data_refs_alignment
1291 This pass will use loop versioning and loop peeling in order to enhance
1292 the alignment of data references in the loop.
1294 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1295 original loop is to be vectorized. Any other loops that are created by
1296    the transformations performed in this pass are not supposed to be
1297 vectorized. This restriction will be relaxed.
1299 This pass will require a cost model to guide it whether to apply peeling
1300 or versioning or a combination of the two. For example, the scheme that
1301    Intel uses when given a loop with several memory accesses, is as follows:
1302    choose one memory access ('p') whose alignment you want to force by doing
1303 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1304 other accesses are not necessarily aligned, or (2) use loop versioning to
1305 generate one loop in which all accesses are aligned, and another loop in
1306 which only 'p' is necessarily aligned.
1308 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1309    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1310 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1312 Devising a cost model is the most critical aspect of this work. It will
1313 guide us on which access to peel for, whether to use loop versioning, how
1314 many versions to create, etc. The cost model will probably consist of
1315 generic considerations as well as target specific considerations (on
1316 powerpc for example, misaligned stores are more painful than misaligned
1317 loads).
1319 Here are the general steps involved in alignment enhancements:
1321 -- original loop, before alignment analysis:
1322 for (i=0; i<N; i++){
1323 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1324 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1327 -- After vect_compute_data_refs_alignment:
1328 for (i=0; i<N; i++){
1329 x = q[i]; # DR_MISALIGNMENT(q) = 3
1330 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1333 -- Possibility 1: we do loop versioning:
1334 if (p is aligned) {
1335 for (i=0; i<N; i++){ # loop 1A
1336 x = q[i]; # DR_MISALIGNMENT(q) = 3
1337 p[i] = y; # DR_MISALIGNMENT(p) = 0
1340 else {
1341 for (i=0; i<N; i++){ # loop 1B
1342 x = q[i]; # DR_MISALIGNMENT(q) = 3
1343 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1347 -- Possibility 2: we do loop peeling:
1348 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1349 x = q[i];
1350 p[i] = y;
1352 for (i = 3; i < N; i++){ # loop 2A
1353 x = q[i]; # DR_MISALIGNMENT(q) = 0
1354 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1357 -- Possibility 3: combination of loop peeling and versioning:
1358 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1359 x = q[i];
1360 p[i] = y;
1362 if (p is aligned) {
1363 for (i = 3; i<N; i++){ # loop 3A
1364 x = q[i]; # DR_MISALIGNMENT(q) = 0
1365 p[i] = y; # DR_MISALIGNMENT(p) = 0
1368 else {
1369 for (i = 3; i<N; i++){ # loop 3B
1370 x = q[i]; # DR_MISALIGNMENT(q) = 0
1371 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1375 These loops are later passed to loop_transform to be vectorized. The
1376 vectorizer will use the alignment information to guide the transformation
1377 (whether to generate regular loads/stores, or with special handling for
1378 misalignment). */
1380 bool
1381 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1383 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1384 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1385 enum dr_alignment_support supportable_dr_alignment;
1386 struct data_reference *dr0 = NULL, *first_store = NULL;
1387 struct data_reference *dr;
1388 unsigned int i, j;
1389 bool do_peeling = false;
1390 bool do_versioning = false;
1391 bool stat;
1392 gimple *stmt;
1393 stmt_vec_info stmt_info;
1394 unsigned int npeel = 0;
1395 bool all_misalignments_unknown = true;
1396 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1397 unsigned possible_npeel_number = 1;
1398 tree vectype;
1399 unsigned int nelements, mis, same_align_drs_max = 0;
1400 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1401 hash_table<peel_info_hasher> peeling_htab (1);
1403 if (dump_enabled_p ())
1404 dump_printf_loc (MSG_NOTE, vect_location,
1405 "=== vect_enhance_data_refs_alignment ===\n");
1407 /* Reset data so we can safely be called multiple times. */
1408 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1409 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1411 /* While cost model enhancements are expected in the future, the high level
1412 view of the code at this time is as follows:
1414 A) If there is a misaligned access then see if peeling to align
1415 this access can make all data references satisfy
1416 vect_supportable_dr_alignment. If so, update data structures
1417 as needed and return true.
1419 B) If peeling wasn't possible and there is a data reference with an
1420 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1421 then see if loop versioning checks can be used to make all data
1422 references satisfy vect_supportable_dr_alignment. If so, update
1423 data structures as needed and return true.
1425 C) If neither peeling nor versioning were successful then return false if
1426 any data reference does not satisfy vect_supportable_dr_alignment.
1428 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1430 Note, Possibility 3 above (which is peeling and versioning together) is not
1431 being done at this time. */
1433 /* (1) Peeling to force alignment. */
1435 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1436 Considerations:
1437 + How many accesses will become aligned due to the peeling
1438 - How many accesses will become unaligned due to the peeling,
1439 and the cost of misaligned accesses.
1440 - The cost of peeling (the extra runtime checks, the increase
1441 in code size). */
1443 FOR_EACH_VEC_ELT (datarefs, i, dr)
1445 stmt = DR_STMT (dr);
1446 stmt_info = vinfo_for_stmt (stmt);
1448 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1449 continue;
1451 /* For interleaving, only the alignment of the first access
1452 matters. */
1453 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1454 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1455 continue;
1457 /* For invariant accesses there is nothing to enhance. */
1458 if (integer_zerop (DR_STEP (dr)))
1459 continue;
1461 /* Strided accesses perform only component accesses, alignment is
1462 irrelevant for them. */
1463 if (STMT_VINFO_STRIDED_P (stmt_info)
1464 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1465 continue;
1467 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1468 do_peeling = vector_alignment_reachable_p (dr);
1469 if (do_peeling)
1471 if (known_alignment_for_access_p (dr))
1473 unsigned int npeel_tmp;
1474 bool negative = tree_int_cst_compare (DR_STEP (dr),
1475 size_zero_node) < 0;
1477 /* Save info about DR in the hash table. */
1478 vectype = STMT_VINFO_VECTYPE (stmt_info);
1479 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1480 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1481 TREE_TYPE (DR_REF (dr))));
1482 npeel_tmp = (negative
1483 ? (mis - nelements) : (nelements - mis))
1484 & (nelements - 1);
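          /* For example: for a forward access with nelements == 4 and a
             misalignment of 3 elements, npeel_tmp is (4 - 3) & 3 == 1.  */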
1486 /* For multiple types, it is possible that the bigger type access
1487 will have more than one peeling option. E.g., a loop with two
1488 types: one of size (vector size / 4), and the other one of
1489              size (vector size / 8).  The vectorization factor will be 8.  If both
1490              accesses are misaligned by 3, the first one needs one scalar
1491              iteration to be aligned, and the second one needs 5.  But the
1492              first one will also be aligned by peeling 5 scalar
1493              iterations, and in that case both accesses will be aligned.
1494 Hence, except for the immediate peeling amount, we also want
1495 to try to add full vector size, while we don't exceed
1496 vectorization factor.
1497              We do this automatically for the cost model, since we calculate the cost
1498 for every peeling option. */
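          /* Worked example for the comment above, with 16-byte vectors:
             4-byte elements give nelements 4, 2-byte elements give
             nelements 8.  A misalignment of 3 elements then needs
             4 - 3 = 1 and 8 - 3 = 5 peeled iterations respectively, and
             peeling 5 also aligns the first access because 5 % 4 == 1.  */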
1499 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1501 if (STMT_SLP_TYPE (stmt_info))
1502 possible_npeel_number
1503 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1504 else
1505 possible_npeel_number = vf / nelements;
1508 /* Handle the aligned case. We may decide to align some other
1509 access, making DR unaligned. */
1510 if (DR_MISALIGNMENT (dr) == 0)
1512 npeel_tmp = 0;
1513 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1514 possible_npeel_number++;
1517 for (j = 0; j < possible_npeel_number; j++)
1519 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1520 dr, npeel_tmp);
1521 npeel_tmp += nelements;
1524 all_misalignments_unknown = false;
1525 /* Data-ref that was chosen for the case that all the
1526 misalignments are unknown is not relevant anymore, since we
1527 have a data-ref with known alignment. */
1528 dr0 = NULL;
1530 else
1532 /* If we don't know any misalignment values, we prefer
1533 peeling for data-ref that has the maximum number of data-refs
1534 with the same alignment, unless the target prefers to align
1535              stores over loads.  */
1536 if (all_misalignments_unknown)
1538 unsigned same_align_drs
1539 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1540 if (!dr0
1541 || same_align_drs_max < same_align_drs)
1543 same_align_drs_max = same_align_drs;
1544 dr0 = dr;
1546 /* For data-refs with the same number of related
1547 accesses prefer the one where the misalign
1548 computation will be invariant in the outermost loop. */
1549 else if (same_align_drs_max == same_align_drs)
1551 struct loop *ivloop0, *ivloop;
1552 ivloop0 = outermost_invariant_loop_for_expr
1553 (loop, DR_BASE_ADDRESS (dr0));
1554 ivloop = outermost_invariant_loop_for_expr
1555 (loop, DR_BASE_ADDRESS (dr));
1556 if ((ivloop && !ivloop0)
1557 || (ivloop && ivloop0
1558 && flow_loop_nested_p (ivloop, ivloop0)))
1559 dr0 = dr;
1562 if (!first_store && DR_IS_WRITE (dr))
1563 first_store = dr;
1566 /* If there are both known and unknown misaligned accesses in the
1567 loop, we choose peeling amount according to the known
1568 accesses. */
1569 if (!supportable_dr_alignment)
1571 dr0 = dr;
1572 if (!first_store && DR_IS_WRITE (dr))
1573 first_store = dr;
1577 else
1579 if (!aligned_access_p (dr))
1581 if (dump_enabled_p ())
1582 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1583 "vector alignment may not be reachable\n");
1584 break;
1589 /* Check if we can possibly peel the loop. */
1590 if (!vect_can_advance_ivs_p (loop_vinfo)
1591 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1592 || loop->inner)
1593 do_peeling = false;
1595 if (do_peeling
1596 && all_misalignments_unknown
1597 && vect_supportable_dr_alignment (dr0, false))
1599       /* Check if the target requires us to prefer stores over loads, i.e., if
1600 misaligned stores are more expensive than misaligned loads (taking
1601 drs with same alignment into account). */
1602 if (first_store && DR_IS_READ (dr0))
1604 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1605 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1606 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1607 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1608 stmt_vector_for_cost dummy;
1609 dummy.create (2);
1611 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1612 &dummy);
1613 vect_get_data_access_cost (first_store, &store_inside_cost,
1614 &store_outside_cost, &dummy);
1616 dummy.release ();
1618 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1619 aligning the load DR0). */
1620 load_inside_penalty = store_inside_cost;
1621 load_outside_penalty = store_outside_cost;
1622 for (i = 0;
1623 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1624 DR_STMT (first_store))).iterate (i, &dr);
1625 i++)
1626 if (DR_IS_READ (dr))
1628 load_inside_penalty += load_inside_cost;
1629 load_outside_penalty += load_outside_cost;
1631 else
1633 load_inside_penalty += store_inside_cost;
1634 load_outside_penalty += store_outside_cost;
1637 /* Calculate the penalty for leaving DR0 unaligned (by
1638 aligning the FIRST_STORE). */
1639 store_inside_penalty = load_inside_cost;
1640 store_outside_penalty = load_outside_cost;
1641 for (i = 0;
1642 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1643 DR_STMT (dr0))).iterate (i, &dr);
1644 i++)
1645 if (DR_IS_READ (dr))
1647 store_inside_penalty += load_inside_cost;
1648 store_outside_penalty += load_outside_cost;
1650 else
1652 store_inside_penalty += store_inside_cost;
1653 store_outside_penalty += store_outside_cost;
1656 if (load_inside_penalty > store_inside_penalty
1657 || (load_inside_penalty == store_inside_penalty
1658 && load_outside_penalty > store_outside_penalty))
1659 dr0 = first_store;
1662 /* In case there are only loads with different unknown misalignments, use
1663 peeling only if it may help to align other accesses in the loop or
1664          if it may help improve load bandwidth when we'd end up using
1665 unaligned loads. */
1666 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1667 if (!first_store
1668 && !STMT_VINFO_SAME_ALIGN_REFS (
1669 vinfo_for_stmt (DR_STMT (dr0))).length ()
1670 && (vect_supportable_dr_alignment (dr0, false)
1671 != dr_unaligned_supported
1672 || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1673 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1674 do_peeling = false;
1677 if (do_peeling && !dr0)
1679       /* Peeling is possible, but no data access requires alignment in order
1680          to be supported.  So we try to choose the best possible peeling.  */
1682 /* We should get here only if there are drs with known misalignment. */
1683 gcc_assert (!all_misalignments_unknown);
1685 /* Choose the best peeling from the hash table. */
1686 dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
1687 loop_vinfo, &npeel,
1688 &body_cost_vec);
1689 if (!dr0 || !npeel)
1690 do_peeling = false;
1693 if (do_peeling)
1695 stmt = DR_STMT (dr0);
1696 stmt_info = vinfo_for_stmt (stmt);
1697 vectype = STMT_VINFO_VECTYPE (stmt_info);
1698 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1700 if (known_alignment_for_access_p (dr0))
1702 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1703 size_zero_node) < 0;
1704 if (!npeel)
1706 /* Since it's known at compile time, compute the number of
1707 iterations in the peeled loop (the peeling factor) for use in
1708 updating DR_MISALIGNMENT values. The peeling factor is the
1709 vectorization factor minus the misalignment as an element
1710 count. */
1711 mis = DR_MISALIGNMENT (dr0);
1712 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1713 npeel = ((negative ? mis - nelements : nelements - mis)
1714 & (nelements - 1));
1717 /* For interleaved data access every iteration accesses all the
1718 members of the group, therefore we divide the number of iterations
1719 by the group size. */
1720 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1721 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1722 npeel /= GROUP_SIZE (stmt_info);
1724 if (dump_enabled_p ())
1725 dump_printf_loc (MSG_NOTE, vect_location,
1726 "Try peeling by %d\n", npeel);
1729 /* Ensure that all data refs can be vectorized after the peel. */
1730 FOR_EACH_VEC_ELT (datarefs, i, dr)
1732 int save_misalignment;
1734 if (dr == dr0)
1735 continue;
1737 stmt = DR_STMT (dr);
1738 stmt_info = vinfo_for_stmt (stmt);
1739 /* For interleaving, only the alignment of the first access
1740 matters. */
1741 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1742 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1743 continue;
1745 /* Strided accesses perform only component accesses, alignment is
1746 irrelevant for them. */
1747 if (STMT_VINFO_STRIDED_P (stmt_info)
1748 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1749 continue;
1751 save_misalignment = DR_MISALIGNMENT (dr);
1752 vect_update_misalignment_for_peel (dr, dr0, npeel);
1753 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1754 SET_DR_MISALIGNMENT (dr, save_misalignment);
1756 if (!supportable_dr_alignment)
1758 do_peeling = false;
1759 break;
1763 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1765 stat = vect_verify_datarefs_alignment (loop_vinfo);
1766 if (!stat)
1767 do_peeling = false;
1768 else
1770 body_cost_vec.release ();
1771 return stat;
1775 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1776 if (do_peeling)
1778 unsigned max_allowed_peel
1779 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1780 if (max_allowed_peel != (unsigned)-1)
1782 unsigned max_peel = npeel;
1783 if (max_peel == 0)
1785 gimple *dr_stmt = DR_STMT (dr0);
1786 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1787 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1788 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1790 if (max_peel > max_allowed_peel)
1792 do_peeling = false;
1793 if (dump_enabled_p ())
1794 dump_printf_loc (MSG_NOTE, vect_location,
1795 "Disable peeling, max peels reached: %d\n", max_peel);
1800 /* Cost model #2 - if peeling may result in a remaining loop not
1801 iterating enough to be vectorized then do not peel. */
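/* For example (illustrative values): with a vectorization factor of 4
   and npeel = 3, a loop known to execute only 6 iterations is rejected
   here, since 6 < 4 + 3 and no full vector iteration would remain
   after peeling.  */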
1802 if (do_peeling
1803 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1805 unsigned max_peel
1806 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1807 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1808 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1809 do_peeling = false;
1812 if (do_peeling)
1814 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1815 If the misalignment of DR_i is identical to that of dr0 then set
1816 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1817 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1818 by the peeling factor times the element size of DR_i (MOD the
1819 vectorization factor times the size). Otherwise, the
1820 misalignment of DR_i must be set to unknown. */
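/* Illustrative example (values assumed): with a 16-byte vector,
   4-byte elements and npeel = 2, a DR_i whose misalignment equals
   that of dr0 becomes 0, a DR_i with known misalignment 12 becomes
   (12 + 2 * 4) mod 16 = 4, and a DR_i with unknown misalignment
   stays unknown.  */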
1821 FOR_EACH_VEC_ELT (datarefs, i, dr)
1822 if (dr != dr0)
1823 vect_update_misalignment_for_peel (dr, dr0, npeel);
1825 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1826 if (npeel)
1827 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1828 else
1829 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1830 = DR_MISALIGNMENT (dr0);
1831 SET_DR_MISALIGNMENT (dr0, 0);
1832 if (dump_enabled_p ())
1834 dump_printf_loc (MSG_NOTE, vect_location,
1835 "Alignment of access forced using peeling.\n");
1836 dump_printf_loc (MSG_NOTE, vect_location,
1837 "Peeling for alignment will be applied.\n");
1839 /* The inside-loop cost will be accounted for in vectorizable_load
1840 and vectorizable_store correctly with adjusted alignments.
1841 Drop the body_cost_vec on the floor here. */
1842 body_cost_vec.release ();
1844 stat = vect_verify_datarefs_alignment (loop_vinfo);
1845 gcc_assert (stat);
1846 return stat;
1850 body_cost_vec.release ();
1852 /* (2) Versioning to force alignment. */
1854 /* Try versioning if:
1855 1) optimize loop for speed
1856 2) there is at least one unsupported misaligned data ref with an unknown
1857 misalignment, and
1858 3) all misaligned data refs with a known misalignment are supported, and
1859 4) the number of runtime alignment checks is within reason. */
1861 do_versioning =
1862 optimize_loop_nest_for_speed_p (loop)
1863 && (!loop->inner); /* FORNOW */
1865 if (do_versioning)
1867 FOR_EACH_VEC_ELT (datarefs, i, dr)
1869 stmt = DR_STMT (dr);
1870 stmt_info = vinfo_for_stmt (stmt);
1872 /* For interleaving, only the alignment of the first access
1873 matters. */
1874 if (aligned_access_p (dr)
1875 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1876 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1877 continue;
1879 if (STMT_VINFO_STRIDED_P (stmt_info))
1881 /* Strided loads perform only component accesses, alignment is
1882 irrelevant for them. */
1883 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1884 continue;
1885 do_versioning = false;
1886 break;
1889 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1891 if (!supportable_dr_alignment)
1893 gimple *stmt;
1894 int mask;
1895 tree vectype;
1897 if (known_alignment_for_access_p (dr)
1898 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1899 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1901 do_versioning = false;
1902 break;
1905 stmt = DR_STMT (dr);
1906 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1907 gcc_assert (vectype);
1909 /* The rightmost bits of an aligned address must be zeros.
1910 Construct the mask needed for this test. For example,
1911 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1912 mask must be 15 = 0xf. */
1913 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
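/* Illustrative sketch (assumed shape of the later versioning check):

     if (((addr_1 | addr_2 | ...) & mask) == 0)
       ... loop version with all accesses assumed aligned ...
     else
       ... original scalar loop ...

   so one mask is shared by all potentially unaligned references
   collected below.  */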
1915 /* FORNOW: use the same mask to test all potentially unaligned
1916 references in the loop. The vectorizer currently supports
1917 a single vector size, see the reference to
1918 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1919 vectorization factor is computed. */
1920 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1921 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1922 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1923 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1924 DR_STMT (dr));
1928 /* Versioning requires at least one misaligned data reference. */
1929 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1930 do_versioning = false;
1931 else if (!do_versioning)
1932 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1935 if (do_versioning)
1937 vec<gimple *> may_misalign_stmts
1938 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1939 gimple *stmt;
1941 /* It can now be assumed that the data references in the statements
1942 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1943 of the loop being vectorized. */
1944 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1946 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1947 dr = STMT_VINFO_DATA_REF (stmt_info);
1948 SET_DR_MISALIGNMENT (dr, 0);
1949 if (dump_enabled_p ())
1950 dump_printf_loc (MSG_NOTE, vect_location,
1951 "Alignment of access forced using versioning.\n");
1954 if (dump_enabled_p ())
1955 dump_printf_loc (MSG_NOTE, vect_location,
1956 "Versioning for alignment will be applied.\n");
1958 /* Peeling and versioning can't be done together at this time. */
1959 gcc_assert (! (do_peeling && do_versioning));
1961 stat = vect_verify_datarefs_alignment (loop_vinfo);
1962 gcc_assert (stat);
1963 return stat;
1966 /* This point is reached if neither peeling nor versioning is being done. */
1967 gcc_assert (! (do_peeling || do_versioning));
1969 stat = vect_verify_datarefs_alignment (loop_vinfo);
1970 return stat;
1974 /* Function vect_find_same_alignment_drs.
1976 Update group and alignment relations according to the chosen
1977 vectorization factor. */
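/* Illustrative example (assumed loop): with a vectorization factor of 4,
   two int accesses a[i] and a[i + 4] have a dependence distance of 4,
   which is divisible by the VF with equal element sizes, so each data
   reference is recorded in the other's STMT_VINFO_SAME_ALIGN_REFS.  */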
1979 static void
1980 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1981 loop_vec_info loop_vinfo)
1983 unsigned int i;
1984 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1985 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1986 struct data_reference *dra = DDR_A (ddr);
1987 struct data_reference *drb = DDR_B (ddr);
1988 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1989 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1990 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1991 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1992 lambda_vector dist_v;
1993 unsigned int loop_depth;
1995 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1996 return;
1998 if (dra == drb)
1999 return;
2001 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
2002 return;
2004 /* Loop-based vectorization and known data dependence. */
2005 if (DDR_NUM_DIST_VECTS (ddr) == 0)
2006 return;
2008 /* Data-dependence analysis reports a distance vector of zero
2009 for data-references that overlap only in the first iteration
2010 but have steps of different sign (see PR45764).
2011 So as a sanity check require equal DR_STEP. */
2012 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2013 return;
2015 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
2016 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
2018 int dist = dist_v[loop_depth];
2020 if (dump_enabled_p ())
2021 dump_printf_loc (MSG_NOTE, vect_location,
2022 "dependence distance = %d.\n", dist);
2024 /* Same loop iteration. */
2025 if (dist == 0
2026 || (dist % vectorization_factor == 0 && dra_size == drb_size))
2028 /* Two references with distance zero have the same alignment. */
2029 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2030 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2031 if (dump_enabled_p ())
2033 dump_printf_loc (MSG_NOTE, vect_location,
2034 "accesses have the same alignment.\n");
2035 dump_printf (MSG_NOTE,
2036 "dependence distance modulo vf == 0 between ");
2037 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2038 dump_printf (MSG_NOTE, " and ");
2039 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2040 dump_printf (MSG_NOTE, "\n");
2047 /* Function vect_analyze_data_refs_alignment
2049 Analyze the alignment of the data-references in the loop.
2050 Return FALSE if a data reference is found that cannot be vectorized. */
2052 bool
2053 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2055 if (dump_enabled_p ())
2056 dump_printf_loc (MSG_NOTE, vect_location,
2057 "=== vect_analyze_data_refs_alignment ===\n");
2059 /* Mark groups of data references with the same alignment using
2060 data dependence information. */
2061 vec<ddr_p> ddrs = vinfo->ddrs;
2062 struct data_dependence_relation *ddr;
2063 unsigned int i;
2065 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2066 vect_find_same_alignment_drs (ddr, vinfo);
2068 vec<data_reference_p> datarefs = vinfo->datarefs;
2069 struct data_reference *dr;
2071 FOR_EACH_VEC_ELT (datarefs, i, dr)
2073 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2074 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2075 && !vect_compute_data_ref_alignment (dr))
2077 /* Strided accesses perform only component accesses, misalignment
2078 information is irrelevant for them. */
2079 if (STMT_VINFO_STRIDED_P (stmt_info)
2080 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2081 continue;
2083 if (dump_enabled_p ())
2084 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2085 "not vectorized: can't calculate alignment "
2086 "for data ref.\n");
2088 return false;
2092 return true;
2096 /* Analyze alignment of DRs of stmts in NODE. */
2098 static bool
2099 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2101 /* We vectorize from the first scalar stmt in the node unless
2102 the node is permuted in which case we start from the first
2103 element in the group. */
2104 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2105 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2106 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2108 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2109 if (! vect_compute_data_ref_alignment (dr)
2110 || ! verify_data_ref_alignment (dr))
2112 if (dump_enabled_p ())
2113 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2114 "not vectorized: bad data alignment in basic "
2115 "block.\n");
2116 return false;
2119 return true;
2122 /* Function vect_slp_analyze_and_verify_instance_alignment
2124 Analyze the alignment of the data-references in the SLP instance.
2125 Return FALSE if a data reference is found that cannot be vectorized. */
2127 bool
2128 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2130 if (dump_enabled_p ())
2131 dump_printf_loc (MSG_NOTE, vect_location,
2132 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2134 slp_tree node;
2135 unsigned i;
2136 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2137 if (! vect_slp_analyze_and_verify_node_alignment (node))
2138 return false;
2140 node = SLP_INSTANCE_TREE (instance);
2141 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2142 && ! vect_slp_analyze_and_verify_node_alignment
2143 (SLP_INSTANCE_TREE (instance)))
2144 return false;
2146 return true;
2150 /* Analyze groups of accesses: check that DR belongs to a group of
2151 accesses of legal size, step, etc. Detect gaps, single element
2152 interleaving, and other special cases. Set grouped access info.
2153 Collect groups of strided stores for further use in SLP analysis.
2154 Worker for vect_analyze_group_access. */
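/* Illustrative example (assumed source loop):

     for (i = 0; i < n; i++)
       {
         x = a[2 * i];
         y = a[2 * i + 1];
       }

   Both loads have a DR_STEP of twice the element size, so groupsize
   is 2; the second load is chained after the first with GROUP_GAP 1
   and no gap is left at the end of the group.  */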
2156 static bool
2157 vect_analyze_group_access_1 (struct data_reference *dr)
2159 tree step = DR_STEP (dr);
2160 tree scalar_type = TREE_TYPE (DR_REF (dr));
2161 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2162 gimple *stmt = DR_STMT (dr);
2163 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2164 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2165 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2166 HOST_WIDE_INT dr_step = -1;
2167 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2168 bool slp_impossible = false;
2169 struct loop *loop = NULL;
2171 if (loop_vinfo)
2172 loop = LOOP_VINFO_LOOP (loop_vinfo);
2174 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2175 size of the interleaving group (including gaps). */
2176 if (tree_fits_shwi_p (step))
2178 dr_step = tree_to_shwi (step);
2179 groupsize = absu_hwi (dr_step) / type_size;
2181 else
2182 groupsize = 0;
2184 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2185 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2187 /* Check if this DR is a part of an interleaving group, and is a single
2188 element of the group that is accessed in the loop. */
2190 /* Gaps are supported only for loads. STEP must be a multiple of the type
2191 size. The size of the group must be a power of 2. */
2192 if (DR_IS_READ (dr)
2193 && (dr_step % type_size) == 0
2194 && groupsize > 0
2195 && exact_log2 (groupsize) != -1)
2197 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2198 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2199 if (dump_enabled_p ())
2201 dump_printf_loc (MSG_NOTE, vect_location,
2202 "Detected single element interleaving ");
2203 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2204 dump_printf (MSG_NOTE, " step ");
2205 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2206 dump_printf (MSG_NOTE, "\n");
2209 if (loop_vinfo)
2211 if (dump_enabled_p ())
2212 dump_printf_loc (MSG_NOTE, vect_location,
2213 "Data access with gaps requires scalar "
2214 "epilogue loop\n");
2215 if (loop->inner)
2217 if (dump_enabled_p ())
2218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2219 "Peeling for outer loop is not"
2220 " supported\n");
2221 return false;
2224 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2227 return true;
2230 if (dump_enabled_p ())
2232 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2233 "not consecutive access ");
2234 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2237 if (bb_vinfo)
2239 /* Mark the statement as unvectorizable. */
2240 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2241 return true;
2244 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2245 STMT_VINFO_STRIDED_P (stmt_info) = true;
2246 return true;
2249 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2251 /* First stmt in the interleaving chain. Check the chain. */
2252 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2253 struct data_reference *data_ref = dr;
2254 unsigned int count = 1;
2255 tree prev_init = DR_INIT (data_ref);
2256 gimple *prev = stmt;
2257 HOST_WIDE_INT diff, gaps = 0;
2259 while (next)
2261 /* Skip same data-refs. In case two or more stmts share a
2262 data-ref (supported only for loads), we vectorize only the first
2263 stmt, and the rest get their vectorized loads from the first
2264 one. */
2265 if (!tree_int_cst_compare (DR_INIT (data_ref),
2266 DR_INIT (STMT_VINFO_DATA_REF (
2267 vinfo_for_stmt (next)))))
2269 if (DR_IS_WRITE (data_ref))
2271 if (dump_enabled_p ())
2272 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2273 "Two store stmts share the same dr.\n");
2274 return false;
2277 if (dump_enabled_p ())
2278 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2279 "Two or more load stmts share the same dr.\n");
2281 /* For load use the same data-ref load. */
2282 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2284 prev = next;
2285 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2286 continue;
2289 prev = next;
2290 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2292 /* All group members have the same STEP by construction. */
2293 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2295 /* Check that the distance between two accesses is equal to the type
2296 size. Otherwise, we have gaps. */
2297 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2298 - TREE_INT_CST_LOW (prev_init)) / type_size;
2299 if (diff != 1)
2301 /* FORNOW: SLP of accesses with gaps is not supported. */
2302 slp_impossible = true;
2303 if (DR_IS_WRITE (data_ref))
2305 if (dump_enabled_p ())
2306 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2307 "interleaved store with gaps\n");
2308 return false;
2311 gaps += diff - 1;
2314 last_accessed_element += diff;
2316 /* Store the gap from the previous member of the group. If there is no
2317 gap in the access, GROUP_GAP is always 1. */
2318 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2320 prev_init = DR_INIT (data_ref);
2321 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2322 /* Count the number of data-refs in the chain. */
2323 count++;
2326 if (groupsize == 0)
2327 groupsize = count + gaps;
2329 if (groupsize > UINT_MAX)
2331 if (dump_enabled_p ())
2332 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2333 "group is too large\n");
2334 return false;
2337 /* Check that the size of the interleaving is equal to count for stores,
2338 i.e., that there are no gaps. */
2339 if (groupsize != count
2340 && !DR_IS_READ (dr))
2342 if (dump_enabled_p ())
2343 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2344 "interleaved store with gaps\n");
2345 return false;
2348 /* If there is a gap after the last load in the group it is the
2349 difference between the groupsize and the last accessed
2350 element.
2351 When there is no gap, this difference should be 0. */
2352 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
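/* For example (illustrative): a load group a[4*i], a[4*i + 1],
   a[4*i + 2] with step 4 * type_size has groupsize 4 but accesses
   only 3 elements, so a gap of one element remains after the group
   and peeling for gaps is requested further below.  */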
2354 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2355 if (dump_enabled_p ())
2357 dump_printf_loc (MSG_NOTE, vect_location,
2358 "Detected interleaving ");
2359 if (DR_IS_READ (dr))
2360 dump_printf (MSG_NOTE, "load ");
2361 else
2362 dump_printf (MSG_NOTE, "store ");
2363 dump_printf (MSG_NOTE, "of size %u starting with ",
2364 (unsigned)groupsize);
2365 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2366 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2367 dump_printf_loc (MSG_NOTE, vect_location,
2368 "There is a gap of %u elements after the group\n",
2369 GROUP_GAP (vinfo_for_stmt (stmt)));
2372 /* SLP: create an SLP data structure for every interleaving group of
2373 stores for further analysis in vect_analyze_slp. */
2374 if (DR_IS_WRITE (dr) && !slp_impossible)
2376 if (loop_vinfo)
2377 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2378 if (bb_vinfo)
2379 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2382 /* If there is a gap at the end of the group or the group size cannot
2383 be made a multiple of the vector element count then we access excess
2384 elements in the last iteration and thus need to peel that off. */
2385 if (loop_vinfo
2386 && (groupsize - last_accessed_element > 0
2387 || exact_log2 (groupsize) == -1))
2390 if (dump_enabled_p ())
2391 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2392 "Data access with gaps requires scalar "
2393 "epilogue loop\n");
2394 if (loop->inner)
2396 if (dump_enabled_p ())
2397 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2398 "Peeling for outer loop is not supported\n");
2399 return false;
2402 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2406 return true;
2409 /* Analyze groups of accesses: check that DR belongs to a group of
2410 accesses of legal size, step, etc. Detect gaps, single element
2411 interleaving, and other special cases. Set grouped access info.
2412 Collect groups of strided stores for further use in SLP analysis. */
2414 static bool
2415 vect_analyze_group_access (struct data_reference *dr)
2417 if (!vect_analyze_group_access_1 (dr))
2419 /* Dissolve the group if present. */
2420 gimple *next;
2421 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2422 while (stmt)
2424 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2425 next = GROUP_NEXT_ELEMENT (vinfo);
2426 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2427 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2428 stmt = next;
2430 return false;
2432 return true;
2435 /* Analyze the access pattern of the data-reference DR.
2436 In case of non-consecutive accesses call vect_analyze_group_access() to
2437 analyze groups of accesses. */
2439 static bool
2440 vect_analyze_data_ref_access (struct data_reference *dr)
2442 tree step = DR_STEP (dr);
2443 tree scalar_type = TREE_TYPE (DR_REF (dr));
2444 gimple *stmt = DR_STMT (dr);
2445 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2446 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2447 struct loop *loop = NULL;
2449 if (loop_vinfo)
2450 loop = LOOP_VINFO_LOOP (loop_vinfo);
2452 if (loop_vinfo && !step)
2454 if (dump_enabled_p ())
2455 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2456 "bad data-ref access in loop\n");
2457 return false;
2460 /* Allow loads with zero step in inner-loop vectorization. */
2461 if (loop_vinfo && integer_zerop (step))
2463 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2464 if (!nested_in_vect_loop_p (loop, stmt))
2465 return DR_IS_READ (dr);
2466 /* Allow references with zero step for outer loops marked
2467 with pragma omp simd only - it guarantees absence of
2468 loop-carried dependencies between inner loop iterations. */
2469 if (!loop->force_vectorize)
2471 if (dump_enabled_p ())
2472 dump_printf_loc (MSG_NOTE, vect_location,
2473 "zero step in inner loop of nest\n");
2474 return false;
2478 if (loop && nested_in_vect_loop_p (loop, stmt))
2480 /* Interleaved accesses are not yet supported within outer-loop
2481 vectorization for references in the inner-loop. */
2482 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2484 /* For the rest of the analysis we use the outer-loop step. */
2485 step = STMT_VINFO_DR_STEP (stmt_info);
2486 if (integer_zerop (step))
2488 if (dump_enabled_p ())
2489 dump_printf_loc (MSG_NOTE, vect_location,
2490 "zero step in outer loop.\n");
2491 return DR_IS_READ (dr);
2495 /* Consecutive? */
2496 if (TREE_CODE (step) == INTEGER_CST)
2498 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2499 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2500 || (dr_step < 0
2501 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2503 /* Mark that it is not interleaving. */
2504 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2505 return true;
2509 if (loop && nested_in_vect_loop_p (loop, stmt))
2511 if (dump_enabled_p ())
2512 dump_printf_loc (MSG_NOTE, vect_location,
2513 "grouped access in outer loop.\n");
2514 return false;
2518 /* Assume this is a DR handled by non-constant strided load case. */
2519 if (TREE_CODE (step) != INTEGER_CST)
2520 return (STMT_VINFO_STRIDED_P (stmt_info)
2521 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2522 || vect_analyze_group_access (dr)));
2524 /* Non-consecutive access - check if it's part of an interleaving group. */
2525 return vect_analyze_group_access (dr);
2530 /* A helper function used in the comparator function to sort data
2531 references. T1 and T2 are two trees to be compared.
2532 The function returns -1, 0, or 1. */
2534 static int
2535 compare_tree (tree t1, tree t2)
2537 int i, cmp;
2538 enum tree_code code;
2539 char tclass;
2541 if (t1 == t2)
2542 return 0;
2543 if (t1 == NULL)
2544 return -1;
2545 if (t2 == NULL)
2546 return 1;
2549 if (TREE_CODE (t1) != TREE_CODE (t2))
2550 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2552 code = TREE_CODE (t1);
2553 switch (code)
2555 /* For const values, we can just use hash values for comparisons. */
2556 case INTEGER_CST:
2557 case REAL_CST:
2558 case FIXED_CST:
2559 case STRING_CST:
2560 case COMPLEX_CST:
2561 case VECTOR_CST:
2563 hashval_t h1 = iterative_hash_expr (t1, 0);
2564 hashval_t h2 = iterative_hash_expr (t2, 0);
2565 if (h1 != h2)
2566 return h1 < h2 ? -1 : 1;
2567 break;
2570 case SSA_NAME:
2571 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2572 if (cmp != 0)
2573 return cmp;
2575 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2576 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2577 break;
2579 default:
2580 tclass = TREE_CODE_CLASS (code);
2582 /* For var-decl, we could compare their UIDs. */
2583 if (tclass == tcc_declaration)
2585 if (DECL_UID (t1) != DECL_UID (t2))
2586 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2587 break;
2590 /* For expressions with operands, compare their operands recursively. */
2591 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2593 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2594 if (cmp != 0)
2595 return cmp;
2599 return 0;
2603 /* Compare two data-references DRA and DRB to sort them into chunks
2604 suitable for grouping. */
2606 static int
2607 dr_group_sort_cmp (const void *dra_, const void *drb_)
2609 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2610 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2611 int cmp;
2613 /* Stabilize sort. */
2614 if (dra == drb)
2615 return 0;
2617 /* Ordering of DRs according to base. */
2618 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2620 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2621 if (cmp != 0)
2622 return cmp;
2625 /* And according to DR_OFFSET. */
2626 if (!dr_equal_offsets_p (dra, drb))
2628 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2629 if (cmp != 0)
2630 return cmp;
2633 /* Put reads before writes. */
2634 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2635 return DR_IS_READ (dra) ? -1 : 1;
2637 /* Then sort by access size. */
2638 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2639 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2641 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2642 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2643 if (cmp != 0)
2644 return cmp;
2647 /* And by step. */
2648 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2650 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2651 if (cmp != 0)
2652 return cmp;
2655 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2656 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2657 if (cmp == 0)
2658 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2659 return cmp;
2662 /* Function vect_analyze_data_ref_accesses.
2664 Analyze the access pattern of all the data references in the loop.
2666 FORNOW: the only access pattern that is considered vectorizable is a
2667 simple step 1 (consecutive) access.
2669 FORNOW: handle only arrays and pointer accesses. */
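/* Illustrative example (assumed source): the stores

     p[4*i] = a;  p[4*i + 1] = b;  p[4*i + 2] = c;  p[4*i + 3] = d;

   share base address, offset, step and access size and have inits
   0, 4, 8 and 12 for 4-byte elements, so the sorted scan below links
   them into a single interleaving chain headed by the first store.  */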
2671 bool
2672 vect_analyze_data_ref_accesses (vec_info *vinfo)
2674 unsigned int i;
2675 vec<data_reference_p> datarefs = vinfo->datarefs;
2676 struct data_reference *dr;
2678 if (dump_enabled_p ())
2679 dump_printf_loc (MSG_NOTE, vect_location,
2680 "=== vect_analyze_data_ref_accesses ===\n");
2682 if (datarefs.is_empty ())
2683 return true;
2685 /* Sort the array of datarefs to make building the interleaving chains
2686 linear. Don't modify the original vector's order; it is needed for
2687 determining what dependencies are reversed. */
2688 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2689 datarefs_copy.qsort (dr_group_sort_cmp);
2691 /* Build the interleaving chains. */
2692 for (i = 0; i < datarefs_copy.length () - 1;)
2694 data_reference_p dra = datarefs_copy[i];
2695 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2696 stmt_vec_info lastinfo = NULL;
2697 for (i = i + 1; i < datarefs_copy.length (); ++i)
2699 data_reference_p drb = datarefs_copy[i];
2700 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2702 /* ??? Imperfect sorting (non-compatible types, non-modulo
2703 accesses, same accesses) can lead to a group being artificially
2704 split here as we don't just skip over those. If it really
2705 matters we can push those to a worklist and re-iterate
2706 over them. Then we can just skip ahead to the next DR here. */
2708 /* Check that the data-refs have the same first location (except init)
2709 and they are both either store or load (not load and store,
2710 not masked loads or stores). */
2711 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2712 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2713 DR_BASE_ADDRESS (drb), 0)
2714 || !dr_equal_offsets_p (dra, drb)
2715 || !gimple_assign_single_p (DR_STMT (dra))
2716 || !gimple_assign_single_p (DR_STMT (drb)))
2717 break;
2719 /* Check that the data-refs have the same constant size. */
2720 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2721 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2722 if (!tree_fits_uhwi_p (sza)
2723 || !tree_fits_uhwi_p (szb)
2724 || !tree_int_cst_equal (sza, szb))
2725 break;
2727 /* Check that the data-refs have the same step. */
2728 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2729 break;
2731 /* Do not place the same access in the interleaving chain twice. */
2732 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2733 break;
2735 /* Check the types are compatible.
2736 ??? We don't distinguish this during sorting. */
2737 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2738 TREE_TYPE (DR_REF (drb))))
2739 break;
2741 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2742 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2743 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2744 gcc_assert (init_a < init_b);
2746 /* If init_b == init_a + the size of the type * k, we have an
2747 interleaving, and DRA is accessed before DRB. */
2748 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2749 if ((init_b - init_a) % type_size_a != 0)
2750 break;
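/* For example (illustrative): with 4-byte elements, init_a = 0 and
   init_b = 8 give a difference of 8, a multiple of the type size, so
   DRA and DRB may belong to the same interleaving group with DRA
   accessed two elements earlier.  */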
2752 /* If we have a store, the accesses are adjacent. This splits
2753 groups into chunks we support (we don't support vectorization
2754 of stores with gaps). */
2755 if (!DR_IS_READ (dra)
2756 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2757 (DR_INIT (datarefs_copy[i-1]))
2758 != type_size_a))
2759 break;
2761 /* If the step (if not zero or non-constant) is greater than the
2762 difference between data-refs' inits this splits groups into
2763 suitable sizes. */
2764 if (tree_fits_shwi_p (DR_STEP (dra)))
2766 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2767 if (step != 0 && step <= (init_b - init_a))
2768 break;
2771 if (dump_enabled_p ())
2773 dump_printf_loc (MSG_NOTE, vect_location,
2774 "Detected interleaving ");
2775 if (DR_IS_READ (dra))
2776 dump_printf (MSG_NOTE, "load ");
2777 else
2778 dump_printf (MSG_NOTE, "store ");
2779 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2780 dump_printf (MSG_NOTE, " and ");
2781 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2782 dump_printf (MSG_NOTE, "\n");
2785 /* Link the found element into the group list. */
2786 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2788 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2789 lastinfo = stmtinfo_a;
2791 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2792 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2793 lastinfo = stmtinfo_b;
2797 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2798 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2799 && !vect_analyze_data_ref_access (dr))
2801 if (dump_enabled_p ())
2802 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2803 "not vectorized: complicated access pattern.\n");
2805 if (is_a <bb_vec_info> (vinfo))
2807 /* Mark the statement as not vectorizable. */
2808 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2809 continue;
2811 else
2813 datarefs_copy.release ();
2814 return false;
2818 datarefs_copy.release ();
2819 return true;
2823 /* Operator == between two dr_with_seg_len objects.
2825 This equality operator is used to make sure two data refs
2826 are the same one so that we will consider combining the
2827 aliasing checks of those two pairs of data-dependent data
2828 refs. */
2830 static bool
2831 operator == (const dr_with_seg_len& d1,
2832 const dr_with_seg_len& d2)
2834 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2835 DR_BASE_ADDRESS (d2.dr), 0)
2836 && compare_tree (d1.offset, d2.offset) == 0
2837 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2840 /* Function comp_dr_with_seg_len_pair.
2842 Comparison function for sorting objects of dr_with_seg_len_pair_t
2843 so that we can combine aliasing checks in one scan. */
2845 static int
2846 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2848 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2849 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2851 const dr_with_seg_len &p11 = p1->first,
2852 &p12 = p1->second,
2853 &p21 = p2->first,
2854 &p22 = p2->second;
2856 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2857 if a and c have the same base address and step, and b and d have the same
2858 address and step. Therefore, if either a&c or b&d don't have the same address
2859 and step, we don't care about the order of those two pairs after sorting. */
2860 int comp_res;
2862 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2863 DR_BASE_ADDRESS (p21.dr))) != 0)
2864 return comp_res;
2865 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2866 DR_BASE_ADDRESS (p22.dr))) != 0)
2867 return comp_res;
2868 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2869 return comp_res;
2870 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2871 return comp_res;
2872 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2873 return comp_res;
2874 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2875 return comp_res;
2877 return 0;
2880 /* Function vect_vfa_segment_size.
2882 Create an expression that computes the size of segment
2883 that will be accessed for a data reference. The function takes into
2884 account that realignment loads may access one more vector.
2886 Input:
2887 DR: The data reference.
2888 LENGTH_FACTOR: segment length to consider.
2890 Return an expression whose value is the size of segment which will be
2891 accessed by DR. */
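/* Illustrative example (values assumed): for a data reference with a
   4-byte step and LENGTH_FACTOR equal to a vectorization factor of 4,
   the segment length is 4 * 4 = 16 bytes; if the access uses the
   optimized explicit-realignment scheme, one extra vector size is
   added on top of that.  */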
2893 static tree
2894 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2896 tree segment_length;
2898 if (integer_zerop (DR_STEP (dr)))
2899 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2900 else
2901 segment_length = size_binop (MULT_EXPR,
2902 fold_convert (sizetype, DR_STEP (dr)),
2903 fold_convert (sizetype, length_factor));
2905 if (vect_supportable_dr_alignment (dr, false)
2906 == dr_explicit_realign_optimized)
2908 tree vector_size = TYPE_SIZE_UNIT
2909 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2911 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2913 return segment_length;
2916 /* Function vect_prune_runtime_alias_test_list.
2918 Prune a list of ddrs to be tested at run-time by versioning for alias.
2919 Merge several alias checks into one if possible.
2920 Return FALSE if the resulting list of ddrs is longer than allowed by
2921 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2923 bool
2924 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2926 vec<ddr_p> may_alias_ddrs =
2927 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2928 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2929 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2930 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2931 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2933 ddr_p ddr;
2934 unsigned int i;
2935 tree length_factor;
2937 if (dump_enabled_p ())
2938 dump_printf_loc (MSG_NOTE, vect_location,
2939 "=== vect_prune_runtime_alias_test_list ===\n");
2941 if (may_alias_ddrs.is_empty ())
2942 return true;
2944 /* Basically, for each pair of dependent data refs store_ptr_0
2945 and load_ptr_0, we create an expression:
2947 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2948 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2950 for aliasing checks. However, in some cases we can decrease
2951 the number of checks by combining two checks into one. For
2952 example, suppose we have another pair of data refs store_ptr_0
2953 and load_ptr_1, and if the following condition is satisfied:
2955 load_ptr_0 < load_ptr_1 &&
2956 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2958 (this condition means, in each iteration of the vectorized loop,
2959 the accessed memory of store_ptr_0 cannot be between the memory
2960 of load_ptr_0 and load_ptr_1.)
2962 we can then use only the following expression to finish the
2963 aliasing checks between store_ptr_0 & load_ptr_0 and
2964 store_ptr_0 & load_ptr_1:
2966 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2967 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2969 Note that we only consider the case where load_ptr_0 and load_ptr_1 have the
2970 same base address. */
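/* A concrete instance (illustrative values): if load_ptr_1 is
   load_ptr_0 + 8 and every segment is 16 bytes, then DIFF = 8 does not
   exceed the store segment length, so the two (store_ptr_0, load_ptr_*)
   pairs are merged below into one check whose load segment is enlarged
   to 16 + 8 = 24 bytes.  */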
2972 comp_alias_ddrs.create (may_alias_ddrs.length ());
2974 /* First, we collect all data ref pairs for aliasing checks. */
2975 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2977 struct data_reference *dr_a, *dr_b;
2978 gimple *dr_group_first_a, *dr_group_first_b;
2979 tree segment_length_a, segment_length_b;
2980 gimple *stmt_a, *stmt_b;
2982 dr_a = DDR_A (ddr);
2983 stmt_a = DR_STMT (DDR_A (ddr));
2984 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2985 if (dr_group_first_a)
2987 stmt_a = dr_group_first_a;
2988 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2991 dr_b = DDR_B (ddr);
2992 stmt_b = DR_STMT (DDR_B (ddr));
2993 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2994 if (dr_group_first_b)
2996 stmt_b = dr_group_first_b;
2997 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3000 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3001 length_factor = scalar_loop_iters;
3002 else
3003 length_factor = size_int (vect_factor);
3004 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3005 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3007 dr_with_seg_len_pair_t dr_with_seg_len_pair
3008 (dr_with_seg_len (dr_a, segment_length_a),
3009 dr_with_seg_len (dr_b, segment_length_b));
3011 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
3012 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3014 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3017 /* Second, we sort the collected data ref pairs so that we can scan
3018 them once to combine all possible aliasing checks. */
3019 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
3021 /* Third, we scan the sorted dr pairs and check if we can combine
3022 alias checks of two neighbouring dr pairs. */
3023 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
3025 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
3026 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
3027 *dr_b1 = &comp_alias_ddrs[i-1].second,
3028 *dr_a2 = &comp_alias_ddrs[i].first,
3029 *dr_b2 = &comp_alias_ddrs[i].second;
3031 /* Remove duplicate data ref pairs. */
3032 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
3034 if (dump_enabled_p ())
3036 dump_printf_loc (MSG_NOTE, vect_location,
3037 "found equal ranges ");
3038 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3039 DR_REF (dr_a1->dr));
3040 dump_printf (MSG_NOTE, ", ");
3041 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3042 DR_REF (dr_b1->dr));
3043 dump_printf (MSG_NOTE, " and ");
3044 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3045 DR_REF (dr_a2->dr));
3046 dump_printf (MSG_NOTE, ", ");
3047 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3048 DR_REF (dr_b2->dr));
3049 dump_printf (MSG_NOTE, "\n");
3052 comp_alias_ddrs.ordered_remove (i--);
3053 continue;
3056 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
3058 /* We consider the case that DR_B1 and DR_B2 are the same memref,
3059 and DR_A1 and DR_A2 are two consecutive memrefs. */
3060 if (*dr_a1 == *dr_a2)
3062 std::swap (dr_a1, dr_b1);
3063 std::swap (dr_a2, dr_b2);
3066 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
3067 DR_BASE_ADDRESS (dr_a2->dr),
3069 || !tree_fits_shwi_p (dr_a1->offset)
3070 || !tree_fits_shwi_p (dr_a2->offset))
3071 continue;
3073 HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
3074 - tree_to_shwi (dr_a1->offset));
3077 /* Now we check if the following condition is satisfied:
3079 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
3081 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
3082 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
3083 have to make a best estimation. We can get the minimum value
3084 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
3085 then either of the following two conditions can guarantee the
3086 one above:
3088 1: DIFF <= MIN_SEG_LEN_B
3089 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
3093 HOST_WIDE_INT min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
3094 ? tree_to_shwi (dr_b1->seg_len)
3095 : vect_factor);
3097 if (diff <= min_seg_len_b
3098 || (tree_fits_shwi_p (dr_a1->seg_len)
3099 && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
3101 if (dump_enabled_p ())
3103 dump_printf_loc (MSG_NOTE, vect_location,
3104 "merging ranges for ");
3105 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3106 DR_REF (dr_a1->dr));
3107 dump_printf (MSG_NOTE, ", ");
3108 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3109 DR_REF (dr_b1->dr));
3110 dump_printf (MSG_NOTE, " and ");
3111 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3112 DR_REF (dr_a2->dr));
3113 dump_printf (MSG_NOTE, ", ");
3114 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3115 DR_REF (dr_b2->dr));
3116 dump_printf (MSG_NOTE, "\n");
3119 dr_a1->seg_len = size_binop (PLUS_EXPR,
3120 dr_a2->seg_len, size_int (diff));
3121 comp_alias_ddrs.ordered_remove (i--);
3126 dump_printf_loc (MSG_NOTE, vect_location,
3127 "improved number of alias checks from %d to %d\n",
3128 may_alias_ddrs.length (), comp_alias_ddrs.length ());
3129 if ((int) comp_alias_ddrs.length () >
3130 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3131 return false;
3133 return true;
3136 /* Check whether a non-affine read or write in stmt is suitable for gather load
3137 or scatter store and if so, return a builtin decl for that operation. */
3139 tree
3140 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo, tree *basep,
3141 tree *offp, int *scalep)
3143 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3144 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3145 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3146 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3147 tree offtype = NULL_TREE;
3148 tree decl, base, off;
3149 machine_mode pmode;
3150 int punsignedp, reversep, pvolatilep = 0;
3152 base = DR_REF (dr);
3153 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3154 see if we can use the def stmt of the address. */
3155 if (is_gimple_call (stmt)
3156 && gimple_call_internal_p (stmt)
3157 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3158 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3159 && TREE_CODE (base) == MEM_REF
3160 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3161 && integer_zerop (TREE_OPERAND (base, 1))
3162 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3164 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3165 if (is_gimple_assign (def_stmt)
3166 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3167 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3170 /* The gather and scatter builtins need address of the form
3171 loop_invariant + vector * {1, 2, 4, 8}
3173 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3174 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3175 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3176 multiplications and additions in it. To get a vector, we need
3177 a single SSA_NAME that will be defined in the loop and will
3178 contain everything that is not loop invariant and that can be
3179 vectorized. The following code attempts to find such a preexisting
3180 SSA_NAME OFF and put the loop invariants into a tree BASE
3181 that can be gimplified before the loop. */
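/* Illustrative example (assumed source): for a gather like a[b[i]]
   with 4-byte elements, the code below typically ends up with
   BASE = &a (loop invariant), OFF = the SSA_NAME holding b[i]
   (possibly sign-extended) and SCALE = 4, which matches the form the
   gather builtin expects.  */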
3182 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3183 &punsignedp, &reversep, &pvolatilep, false);
3184 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3186 if (TREE_CODE (base) == MEM_REF)
3188 if (!integer_zerop (TREE_OPERAND (base, 1)))
3190 if (off == NULL_TREE)
3192 offset_int moff = mem_ref_offset (base);
3193 off = wide_int_to_tree (sizetype, moff);
3195 else
3196 off = size_binop (PLUS_EXPR, off,
3197 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3199 base = TREE_OPERAND (base, 0);
3201 else
3202 base = build_fold_addr_expr (base);
3204 if (off == NULL_TREE)
3205 off = size_zero_node;
3207 /* If base is not loop invariant, then if off is 0 we start with just
3208 the constant offset in the loop invariant BASE and continue with base
3209 as OFF; otherwise give up.
3210 We could handle that case by gimplifying the addition of base + off
3211 into some SSA_NAME and using that as off, but for now punt. */
3212 if (!expr_invariant_in_loop_p (loop, base))
3214 if (!integer_zerop (off))
3215 return NULL_TREE;
3216 off = base;
3217 base = size_int (pbitpos / BITS_PER_UNIT);
3219 /* Otherwise put base + constant offset into the loop invariant BASE
3220 and continue with OFF. */
3221 else
3223 base = fold_convert (sizetype, base);
3224 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3227 /* OFF at this point may be either a SSA_NAME or some tree expression
3228 from get_inner_reference. Try to peel off loop invariants from it
3229 into BASE as long as possible. */
3230 STRIP_NOPS (off);
3231 while (offtype == NULL_TREE)
3233 enum tree_code code;
3234 tree op0, op1, add = NULL_TREE;
3236 if (TREE_CODE (off) == SSA_NAME)
3238 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3240 if (expr_invariant_in_loop_p (loop, off))
3241 return NULL_TREE;
3243 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3244 break;
3246 op0 = gimple_assign_rhs1 (def_stmt);
3247 code = gimple_assign_rhs_code (def_stmt);
3248 op1 = gimple_assign_rhs2 (def_stmt);
3250 else
3252 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3253 return NULL_TREE;
3254 code = TREE_CODE (off);
3255 extract_ops_from_tree (off, &code, &op0, &op1);
3257 switch (code)
3259 case POINTER_PLUS_EXPR:
3260 case PLUS_EXPR:
3261 if (expr_invariant_in_loop_p (loop, op0))
3263 add = op0;
3264 off = op1;
3265 do_add:
3266 add = fold_convert (sizetype, add);
3267 if (scale != 1)
3268 add = size_binop (MULT_EXPR, add, size_int (scale));
3269 base = size_binop (PLUS_EXPR, base, add);
3270 continue;
3272 if (expr_invariant_in_loop_p (loop, op1))
3274 add = op1;
3275 off = op0;
3276 goto do_add;
3278 break;
3279 case MINUS_EXPR:
3280 if (expr_invariant_in_loop_p (loop, op1))
3282 add = fold_convert (sizetype, op1);
3283 add = size_binop (MINUS_EXPR, size_zero_node, add);
3284 off = op0;
3285 goto do_add;
3287 break;
3288 case MULT_EXPR:
3289 if (scale == 1 && tree_fits_shwi_p (op1))
3291 scale = tree_to_shwi (op1);
3292 off = op0;
3293 continue;
3295 break;
3296 case SSA_NAME:
3297 off = op0;
3298 continue;
3299 CASE_CONVERT:
3300 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3301 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3302 break;
3303 if (TYPE_PRECISION (TREE_TYPE (op0))
3304 == TYPE_PRECISION (TREE_TYPE (off)))
3306 off = op0;
3307 continue;
3309 if (TYPE_PRECISION (TREE_TYPE (op0))
3310 < TYPE_PRECISION (TREE_TYPE (off)))
3312 off = op0;
3313 offtype = TREE_TYPE (off);
3314 STRIP_NOPS (off);
3315 continue;
3317 break;
3318 default:
3319 break;
3321 break;
3324 /* If at the end OFF still isn't a SSA_NAME or isn't
3325 defined in the loop, punt. */
3326 if (TREE_CODE (off) != SSA_NAME
3327 || expr_invariant_in_loop_p (loop, off))
3328 return NULL_TREE;
3330 if (offtype == NULL_TREE)
3331 offtype = TREE_TYPE (off);
3333 if (DR_IS_READ (dr))
3334 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3335 offtype, scale);
3336 else
3337 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3338 offtype, scale);
3340 if (decl == NULL_TREE)
3341 return NULL_TREE;
3343 if (basep)
3344 *basep = base;
3345 if (offp)
3346 *offp = off;
3347 if (scalep)
3348 *scalep = scale;
3349 return decl;
3352 /* Function vect_analyze_data_refs.
3354 Find all the data references in the loop or basic block.
3356 The general structure of the analysis of data refs in the vectorizer is as
3357 follows:
3358 1- vect_analyze_data_refs(loop/bb): call
3359 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3360 in the loop/bb and their dependences.
3361 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3362 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3363 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3367 bool
3368 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3370 struct loop *loop = NULL;
3371 unsigned int i;
3372 struct data_reference *dr;
3373 tree scalar_type;
3375 if (dump_enabled_p ())
3376 dump_printf_loc (MSG_NOTE, vect_location,
3377 "=== vect_analyze_data_refs ===\n");
3379 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3380 loop = LOOP_VINFO_LOOP (loop_vinfo);
3382 /* Go through the data-refs, check that the analysis succeeded. Update
3383 pointer from stmt_vec_info struct to DR and vectype. */
3385 vec<data_reference_p> datarefs = vinfo->datarefs;
3386 FOR_EACH_VEC_ELT (datarefs, i, dr)
3388 gimple *stmt;
3389 stmt_vec_info stmt_info;
3390 tree base, offset, init;
3391 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3392 bool simd_lane_access = false;
3393 int vf;
3395 again:
3396 if (!dr || !DR_REF (dr))
3398 if (dump_enabled_p ())
3399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3400 "not vectorized: unhandled data-ref\n");
3401 return false;
3404 stmt = DR_STMT (dr);
3405 stmt_info = vinfo_for_stmt (stmt);
3407 /* Discard clobbers from the dataref vector. We will remove
3408 clobber stmts during vectorization. */
3409 if (gimple_clobber_p (stmt))
3411 free_data_ref (dr);
3412 if (i == datarefs.length () - 1)
3414 datarefs.pop ();
3415 break;
3417 datarefs.ordered_remove (i);
3418 dr = datarefs[i];
3419 goto again;
3422 /* Check that analysis of the data-ref succeeded. */
3423 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3424 || !DR_STEP (dr))
3426 bool maybe_gather
3427 = DR_IS_READ (dr)
3428 && !TREE_THIS_VOLATILE (DR_REF (dr))
3429 && targetm.vectorize.builtin_gather != NULL;
3430 bool maybe_scatter
3431 = DR_IS_WRITE (dr)
3432 && !TREE_THIS_VOLATILE (DR_REF (dr))
3433 && targetm.vectorize.builtin_scatter != NULL;
3434 bool maybe_simd_lane_access
3435 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3437 /* If target supports vector gather loads or scatter stores, or if
3438 this might be a SIMD lane access, see if they can be used. */
3439 if (is_a <loop_vec_info> (vinfo)
3440 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3441 && !nested_in_vect_loop_p (loop, stmt))
3443 struct data_reference *newdr
3444 = create_data_ref (NULL, loop_containing_stmt (stmt),
3445 DR_REF (dr), stmt, maybe_scatter ? false : true);
3446 gcc_assert (newdr != NULL && DR_REF (newdr));
3447 if (DR_BASE_ADDRESS (newdr)
3448 && DR_OFFSET (newdr)
3449 && DR_INIT (newdr)
3450 && DR_STEP (newdr)
3451 && integer_zerop (DR_STEP (newdr)))
3453 if (maybe_simd_lane_access)
3455 tree off = DR_OFFSET (newdr);
3456 STRIP_NOPS (off);
3457 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3458 && TREE_CODE (off) == MULT_EXPR
3459 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3461 tree step = TREE_OPERAND (off, 1);
3462 off = TREE_OPERAND (off, 0);
3463 STRIP_NOPS (off);
3464 if (CONVERT_EXPR_P (off)
3465 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3466 0)))
3467 < TYPE_PRECISION (TREE_TYPE (off)))
3468 off = TREE_OPERAND (off, 0);
3469 if (TREE_CODE (off) == SSA_NAME)
3471 gimple *def = SSA_NAME_DEF_STMT (off);
3472 tree reft = TREE_TYPE (DR_REF (newdr));
3473 if (is_gimple_call (def)
3474 && gimple_call_internal_p (def)
3475 && (gimple_call_internal_fn (def)
3476 == IFN_GOMP_SIMD_LANE))
3478 tree arg = gimple_call_arg (def, 0);
3479 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3480 arg = SSA_NAME_VAR (arg);
3481 if (arg == loop->simduid
3482 /* For now. */
3483 && tree_int_cst_equal
3484 (TYPE_SIZE_UNIT (reft),
3485 step))
3487 DR_OFFSET (newdr) = ssize_int (0);
3488 DR_STEP (newdr) = step;
3489 DR_ALIGNED_TO (newdr)
3490 = size_int (BIGGEST_ALIGNMENT);
3491 dr = newdr;
3492 simd_lane_access = true;
3498 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3500 dr = newdr;
3501 if (maybe_gather)
3502 gatherscatter = GATHER;
3503 else
3504 gatherscatter = SCATTER;
3507 if (gatherscatter == SG_NONE && !simd_lane_access)
3508 free_data_ref (newdr);
3511 if (gatherscatter == SG_NONE && !simd_lane_access)
3513 if (dump_enabled_p ())
3515 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3516 "not vectorized: data ref analysis "
3517 "failed ");
3518 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3519 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3522 if (is_a <bb_vec_info> (vinfo))
3523 break;
3525 return false;
3529 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3531 if (dump_enabled_p ())
3532 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3533 "not vectorized: base addr of dr is a "
3534 "constant\n");
3536 if (is_a <bb_vec_info> (vinfo))
3537 break;
3539 if (gatherscatter != SG_NONE || simd_lane_access)
3540 free_data_ref (dr);
3541 return false;
3544 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3546 if (dump_enabled_p ())
3548 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3549 "not vectorized: volatile type ");
3550 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3551 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3554 if (is_a <bb_vec_info> (vinfo))
3555 break;
3557 return false;
3560 if (stmt_can_throw_internal (stmt))
3562 if (dump_enabled_p ())
3564 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3565 "not vectorized: statement can throw an "
3566 "exception ");
3567 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3568 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3571 if (is_a <bb_vec_info> (vinfo))
3572 break;
3574 if (gatherscatter != SG_NONE || simd_lane_access)
3575 free_data_ref (dr);
3576 return false;
3579 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3580 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3582 if (dump_enabled_p ())
3584 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3585 "not vectorized: statement is bitfield "
3586 "access ");
3587 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3588 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3591 if (is_a <bb_vec_info> (vinfo))
3592 break;
3594 if (gatherscatter != SG_NONE || simd_lane_access)
3595 free_data_ref (dr);
3596 return false;
3599 base = unshare_expr (DR_BASE_ADDRESS (dr));
3600 offset = unshare_expr (DR_OFFSET (dr));
3601 init = unshare_expr (DR_INIT (dr));
3603 if (is_gimple_call (stmt)
3604 && (!gimple_call_internal_p (stmt)
3605 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3606 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3608 if (dump_enabled_p ())
3610 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3611 "not vectorized: dr in a call ");
3612 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3613 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3616 if (is_a <bb_vec_info> (vinfo))
3617 break;
3619 if (gatherscatter != SG_NONE || simd_lane_access)
3620 free_data_ref (dr);
3621 return false;
3624 /* Update DR field in stmt_vec_info struct. */
3626 /* If the dataref is in an inner-loop of the loop that is considered for
3627 vectorization, we also want to analyze the access relative to
3628 the outer-loop (DR contains information only relative to the
3629 inner-most enclosing loop). We do that by building a reference to the
3630 first location accessed by the inner-loop, and analyze it relative to
3631 the outer-loop. */
3632 if (loop && nested_in_vect_loop_p (loop, stmt))
3634 tree outer_step, outer_base, outer_init;
3635 HOST_WIDE_INT pbitsize, pbitpos;
3636 tree poffset;
3637 machine_mode pmode;
3638 int punsignedp, preversep, pvolatilep;
3639 affine_iv base_iv, offset_iv;
3640 tree dinit;
3642 /* Build a reference to the first location accessed by the
3643 inner-loop: *(BASE+INIT). (The first location is actually
3644 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3645 tree inner_base = build_fold_indirect_ref
3646 (fold_build_pointer_plus (base, init));
3648 if (dump_enabled_p ())
3650 dump_printf_loc (MSG_NOTE, vect_location,
3651 "analyze in outer-loop: ");
3652 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3653 dump_printf (MSG_NOTE, "\n");
3656 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3657 &poffset, &pmode, &punsignedp,
3658 &preversep, &pvolatilep, false);
3659 gcc_assert (outer_base != NULL_TREE);
3661 if (pbitpos % BITS_PER_UNIT != 0)
3663 if (dump_enabled_p ())
3664 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3665 "failed: bit offset alignment.\n");
3666 return false;
3669 if (preversep)
3671 if (dump_enabled_p ())
3672 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3673 "failed: reverse storage order.\n");
3674 return false;
3677 outer_base = build_fold_addr_expr (outer_base);
3678 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3679 &base_iv, false))
3681 if (dump_enabled_p ())
3682 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3683 "failed: evolution of base is not affine.\n");
3684 return false;
3687 if (offset)
3689 if (poffset)
3690 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3691 poffset);
3692 else
3693 poffset = offset;
3696 if (!poffset)
3698 offset_iv.base = ssize_int (0);
3699 offset_iv.step = ssize_int (0);
3701 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3702 &offset_iv, false))
3704 if (dump_enabled_p ())
3705 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3706 "evolution of offset is not affine.\n");
3707 return false;
3710 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3711 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3712 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3713 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3714 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3716 outer_step = size_binop (PLUS_EXPR,
3717 fold_convert (ssizetype, base_iv.step),
3718 fold_convert (ssizetype, offset_iv.step));
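/* Editorial note (not in the original source): the step of the access
   relative to the outer loop is simply the sum, computed above, of the
   per-outer-iteration evolutions of the base address and of the offset.  */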
3720 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3721 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3722 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3723 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3724 STMT_VINFO_DR_OFFSET (stmt_info) =
3725 fold_convert (ssizetype, offset_iv.base);
3726 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3727 size_int (highest_pow2_factor (offset_iv.base));
3729 if (dump_enabled_p ())
3731 dump_printf_loc (MSG_NOTE, vect_location,
3732 "\touter base_address: ");
3733 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3734 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3735 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3736 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3737 STMT_VINFO_DR_OFFSET (stmt_info));
3738 dump_printf (MSG_NOTE,
3739 "\n\touter constant offset from base address: ");
3740 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3741 STMT_VINFO_DR_INIT (stmt_info));
3742 dump_printf (MSG_NOTE, "\n\touter step: ");
3743 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3744 STMT_VINFO_DR_STEP (stmt_info));
3745 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3746 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3747 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3748 dump_printf (MSG_NOTE, "\n");
3752 if (STMT_VINFO_DATA_REF (stmt_info))
3754 if (dump_enabled_p ())
3756 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3757 "not vectorized: more than one data ref "
3758 "in stmt: ");
3759 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3760 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3763 if (is_a <bb_vec_info> (vinfo))
3764 break;
3766 if (gatherscatter != SG_NONE || simd_lane_access)
3767 free_data_ref (dr);
3768 return false;
3771 STMT_VINFO_DATA_REF (stmt_info) = dr;
3772 if (simd_lane_access)
3774 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3775 free_data_ref (datarefs[i]);
3776 datarefs[i] = dr;
3779 /* Set vectype for STMT. */
3780 scalar_type = TREE_TYPE (DR_REF (dr));
3781 STMT_VINFO_VECTYPE (stmt_info)
3782 = get_vectype_for_scalar_type (scalar_type);
3783 if (!STMT_VINFO_VECTYPE (stmt_info))
3785 if (dump_enabled_p ())
3787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3788 "not vectorized: no vectype for stmt: ");
3789 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3790 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3791 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3792 scalar_type);
3793 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3796 if (is_a <bb_vec_info> (vinfo))
3798 /* No vector type is fine, the ref can still participate
3799 in dependence analysis, we just can't vectorize it. */
3800 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3801 continue;
3804 if (gatherscatter != SG_NONE || simd_lane_access)
3806 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3807 if (gatherscatter != SG_NONE)
3808 free_data_ref (dr);
3810 return false;
3812 else
3814 if (dump_enabled_p ())
3816 dump_printf_loc (MSG_NOTE, vect_location,
3817 "got vectype for stmt: ");
3818 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3819 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3820 STMT_VINFO_VECTYPE (stmt_info));
3821 dump_printf (MSG_NOTE, "\n");
3825 /* Adjust the minimal vectorization factor according to the
3826 vector type. */
3827 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3828 if (vf > *min_vf)
3829 *min_vf = vf;
3831 if (gatherscatter != SG_NONE)
3833 tree off;
3834 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3835 NULL, &off, NULL)
3836 || get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3838 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3839 free_data_ref (dr);
3840 if (dump_enabled_p ())
3842 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3843 (gatherscatter == GATHER) ?
3844 "not vectorized: not suitable for gather "
3845 "load " :
3846 "not vectorized: not suitable for scatter "
3847 "store ");
3848 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3849 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3851 return false;
3854 datarefs[i] = dr;
3855 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3858 else if (is_a <loop_vec_info> (vinfo)
3859 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3861 if (nested_in_vect_loop_p (loop, stmt))
3863 if (dump_enabled_p ())
3865 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3866 "not vectorized: not suitable for strided "
3867 "load ");
3868 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3869 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3871 return false;
3873 STMT_VINFO_STRIDED_P (stmt_info) = true;
3877 /* If, when trying to vectorize a basic block, we stopped the analysis at
3878 the first dataref we could not analyze, mark the remaining datarefs as
3879 not vectorizable and truncate the vector of datarefs. That avoids
3880 spending useless time analyzing their dependences. */
3881 if (i != datarefs.length ())
3883 gcc_assert (is_a <bb_vec_info> (vinfo));
3884 for (unsigned j = i; j < datarefs.length (); ++j)
3886 data_reference_p dr = datarefs[j];
3887 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3888 free_data_ref (dr);
3890 datarefs.truncate (i);
3893 return true;
3897 /* Function vect_get_new_vect_var.
3899 Returns a name for a new variable. The current naming scheme prefixes
3900 vectorizer-generated variables with "vect", "stmp", "mask" or "vectp"
3901 (depending on the value of VAR_KIND), followed by "_" and NAME when
3902 NAME is provided. */
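/* Editorial sketch (not in the original source; the argument names are
   illustrative only):
     vect_get_new_vect_var (vectype, vect_simple_var, "a")    -> "vect_a"
     vect_get_new_vect_var (ptr_type, vect_pointer_var, "a")  -> "vectp_a"
     vect_get_new_vect_var (scal_type, vect_scalar_var, NULL) -> "stmp"
   The dump machinery may still append uniquifying suffixes when these
   temporaries are printed.  */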
3904 tree
3905 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3907 const char *prefix;
3908 tree new_vect_var;
3910 switch (var_kind)
3912 case vect_simple_var:
3913 prefix = "vect";
3914 break;
3915 case vect_scalar_var:
3916 prefix = "stmp";
3917 break;
3918 case vect_mask_var:
3919 prefix = "mask";
3920 break;
3921 case vect_pointer_var:
3922 prefix = "vectp";
3923 break;
3924 default:
3925 gcc_unreachable ();
3928 if (name)
3930 char* tmp = concat (prefix, "_", name, NULL);
3931 new_vect_var = create_tmp_reg (type, tmp);
3932 free (tmp);
3934 else
3935 new_vect_var = create_tmp_reg (type, prefix);
3937 return new_vect_var;
3940 /* Like vect_get_new_vect_var but return an SSA name. */
3942 tree
3943 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3945 const char *prefix;
3946 tree new_vect_var;
3948 switch (var_kind)
3950 case vect_simple_var:
3951 prefix = "vect";
3952 break;
3953 case vect_scalar_var:
3954 prefix = "stmp";
3955 break;
3956 case vect_pointer_var:
3957 prefix = "vectp";
3958 break;
3959 default:
3960 gcc_unreachable ();
3963 if (name)
3965 char* tmp = concat (prefix, "_", name, NULL);
3966 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
3967 free (tmp);
3969 else
3970 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
3972 return new_vect_var;
3975 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
3977 static void
3978 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3979 stmt_vec_info stmt_info)
3981 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3982 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3983 int misalign = DR_MISALIGNMENT (dr);
3984 if (misalign == -1)
3985 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3986 else
3987 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
3990 /* Function vect_create_addr_base_for_vector_ref.
3992 Create an expression that computes the address of the first memory location
3993 that will be accessed for a data reference.
3995 Input:
3996 STMT: The statement containing the data reference.
3997 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3998 OFFSET: Optional. If supplied, it is added to the initial address.
3999 LOOP: Specify relative to which loop-nest the address should be computed.
4000 For example, when the dataref is in an inner-loop nested in an
4001 outer-loop that is now being vectorized, LOOP can be either the
4002 outer-loop, or the inner-loop. The first memory location accessed
4003 by the following dataref ('in' points to short):
4005 for (i=0; i<N; i++)
4006 for (j=0; j<M; j++)
4007 s += in[i+j]
4009 is as follows:
4010 if LOOP=i_loop: &in (relative to i_loop)
4011 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4012 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4013 initial address. Unlike OFFSET, which is the number of elements to
4014 be added, BYTE_OFFSET is measured in bytes.
4016 Output:
4017 1. Return an SSA_NAME whose value is the address of the memory location of
4018 the first vector of the data reference.
4019 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4020 these statement(s) which define the returned SSA_NAME.
4022 FORNOW: We are only handling array accesses with step 1. */
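/* Editorial sketch (assumed example, not from the original source): in the
   loop-vectorization case, for a dataref into "short a[N]" with
   DR_BASE_ADDRESS == &a, DR_OFFSET == 0, DR_INIT == 4 and OFFSET == 2
   (elements), the statements emitted below amount to
     vectp_a = &a + (4 + 2 * sizeof (short));
   i.e. OFFSET is scaled by the element size, while DR_INIT and BYTE_OFFSET
   are already byte quantities.  */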
4024 tree
4025 vect_create_addr_base_for_vector_ref (gimple *stmt,
4026 gimple_seq *new_stmt_list,
4027 tree offset,
4028 struct loop *loop,
4029 tree byte_offset)
4031 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4032 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4033 tree data_ref_base;
4034 const char *base_name;
4035 tree addr_base;
4036 tree dest;
4037 gimple_seq seq = NULL;
4038 tree base_offset;
4039 tree init;
4040 tree vect_ptr_type;
4041 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4042 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4044 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
4046 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
4048 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
4050 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4051 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
4052 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
4054 else
4056 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
4057 base_offset = unshare_expr (DR_OFFSET (dr));
4058 init = unshare_expr (DR_INIT (dr));
4061 if (loop_vinfo)
4062 base_name = get_name (data_ref_base);
4063 else
4065 base_offset = ssize_int (0);
4066 init = ssize_int (0);
4067 base_name = get_name (DR_REF (dr));
4070 /* Create base_offset */
4071 base_offset = size_binop (PLUS_EXPR,
4072 fold_convert (sizetype, base_offset),
4073 fold_convert (sizetype, init));
4075 if (offset)
4077 offset = fold_build2 (MULT_EXPR, sizetype,
4078 fold_convert (sizetype, offset), step);
4079 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4080 base_offset, offset);
4082 if (byte_offset)
4084 byte_offset = fold_convert (sizetype, byte_offset);
4085 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4086 base_offset, byte_offset);
4089 /* base + base_offset */
4090 if (loop_vinfo)
4091 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4092 else
4094 addr_base = build1 (ADDR_EXPR,
4095 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4096 unshare_expr (DR_REF (dr)));
4099 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4100 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4101 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4102 gimple_seq_add_seq (new_stmt_list, seq);
4104 if (DR_PTR_INFO (dr)
4105 && TREE_CODE (addr_base) == SSA_NAME
4106 && !SSA_NAME_PTR_INFO (addr_base))
4108 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4109 if (offset || byte_offset)
4110 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4113 if (dump_enabled_p ())
4115 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4116 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4117 dump_printf (MSG_NOTE, "\n");
4120 return addr_base;
4124 /* Function vect_create_data_ref_ptr.
4126 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4127 location accessed in the loop by STMT, along with the def-use update
4128 chain to appropriately advance the pointer through the loop iterations.
4129 Also set aliasing information for the pointer. This pointer is used by
4130 the callers to this function to create a memory reference expression for
4131 vector load/store access.
4133 Input:
4134 1. STMT: a stmt that references memory. Expected to be of the form
4135 GIMPLE_ASSIGN <name, data-ref> or
4136 GIMPLE_ASSIGN <data-ref, name>.
4137 2. AGGR_TYPE: the type of the reference, which should be either a vector
4138 or an array.
4139 3. AT_LOOP: the loop where the vector memref is to be created.
4140 4. OFFSET (optional): an offset to be added to the initial address accessed
4141 by the data-ref in STMT.
4142 5. BSI: location where the new stmts are to be placed if there is no loop
4143 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4144 pointing to the initial address.
4145 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4146 to the initial address accessed by the data-ref in STMT. This is
4147 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4148 in bytes.
4150 Output:
4151 1. Declare a new ptr to vector_type, and have it point to the base of the
4152 data reference (the initial address accessed by the data reference).
4153 For example, for vector of type V8HI, the following code is generated:
4155 v8hi *ap;
4156 ap = (v8hi *)initial_address;
4158 if OFFSET is not supplied:
4159 initial_address = &a[init];
4160 if OFFSET is supplied:
4161 initial_address = &a[init + OFFSET];
4162 if BYTE_OFFSET is supplied:
4163 initial_address = &a[init] + BYTE_OFFSET;
4165 Return the initial_address in INITIAL_ADDRESS.
4167 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4168 update the pointer in each iteration of the loop.
4170 Return the increment stmt that updates the pointer in PTR_INCR.
4172 3. Set INV_P to true if the access pattern of the data reference in the
4173 vectorized loop is invariant. Set it to false otherwise.
4175 4. Return the pointer. */
4177 tree
4178 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4179 tree offset, tree *initial_address,
4180 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4181 bool only_init, bool *inv_p, tree byte_offset)
4183 const char *base_name;
4184 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4185 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4186 struct loop *loop = NULL;
4187 bool nested_in_vect_loop = false;
4188 struct loop *containing_loop = NULL;
4189 tree aggr_ptr_type;
4190 tree aggr_ptr;
4191 tree new_temp;
4192 gimple_seq new_stmt_list = NULL;
4193 edge pe = NULL;
4194 basic_block new_bb;
4195 tree aggr_ptr_init;
4196 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4197 tree aptr;
4198 gimple_stmt_iterator incr_gsi;
4199 bool insert_after;
4200 tree indx_before_incr, indx_after_incr;
4201 gimple *incr;
4202 tree step;
4203 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4205 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4206 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4208 if (loop_vinfo)
4210 loop = LOOP_VINFO_LOOP (loop_vinfo);
4211 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4212 containing_loop = (gimple_bb (stmt))->loop_father;
4213 pe = loop_preheader_edge (loop);
4215 else
4217 gcc_assert (bb_vinfo);
4218 only_init = true;
4219 *ptr_incr = NULL;
4222 /* Check the step (evolution) of the load in LOOP, and record
4223 whether it's invariant. */
4224 if (nested_in_vect_loop)
4225 step = STMT_VINFO_DR_STEP (stmt_info);
4226 else
4227 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4229 if (integer_zerop (step))
4230 *inv_p = true;
4231 else
4232 *inv_p = false;
4234 /* Create an expression for the first address accessed by this load
4235 in LOOP. */
4236 base_name = get_name (DR_BASE_ADDRESS (dr));
4238 if (dump_enabled_p ())
4240 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4241 dump_printf_loc (MSG_NOTE, vect_location,
4242 "create %s-pointer variable to type: ",
4243 get_tree_code_name (TREE_CODE (aggr_type)));
4244 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4245 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4246 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4247 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4248 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4249 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4250 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4251 else
4252 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4253 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4254 dump_printf (MSG_NOTE, "\n");
4257 /* (1) Create the new aggregate-pointer variable.
4258 Vector and array types inherit the alias set of their component
4259 type by default so we need to use a ref-all pointer if the data
4260 reference does not conflict with the created aggregated data
4261 reference because it is not addressable. */
4262 bool need_ref_all = false;
4263 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4264 get_alias_set (DR_REF (dr))))
4265 need_ref_all = true;
4266 /* Likewise for any of the data references in the stmt group. */
4267 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4269 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4272 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4273 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4274 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4275 get_alias_set (DR_REF (sdr))))
4277 need_ref_all = true;
4278 break;
4280 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4282 while (orig_stmt);
4284 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4285 need_ref_all);
4286 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4289 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4290 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4291 def-use update cycles for the pointer: one relative to the outer-loop
4292 (LOOP), which is what steps (3) and (4) below do. The other is relative
4293 to the inner-loop (which is the inner-most loop containing the dataref),
4294 and this is done by step (5) below.
4296 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4297 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4298 redundant. Steps (3),(4) create the following:
4300 vp0 = &base_addr;
4301 LOOP: vp1 = phi(vp0,vp2)
4304 vp2 = vp1 + step
4305 goto LOOP
4307 If there is an inner-loop nested in loop, then step (5) will also be
4308 applied, and an additional update in the inner-loop will be created:
4310 vp0 = &base_addr;
4311 LOOP: vp1 = phi(vp0,vp2)
4313 inner: vp3 = phi(vp1,vp4)
4314 vp4 = vp3 + inner_step
4315 if () goto inner
4317 vp2 = vp1 + step
4318 if () goto LOOP */
4320 /* (2) Calculate the initial address of the aggregate-pointer, and set
4321 the aggregate-pointer to point to it before the loop. */
4323 /* Create: &(base[init_val + offset]) + byte_offset in the loop preheader. */
4325 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4326 offset, loop, byte_offset);
4327 if (new_stmt_list)
4329 if (pe)
4331 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4332 gcc_assert (!new_bb);
4334 else
4335 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4338 *initial_address = new_temp;
4339 aggr_ptr_init = new_temp;
4341 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4342 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4343 inner-loop nested in LOOP (during outer-loop vectorization). */
4345 /* No update in loop is required. */
4346 if (only_init && (!loop_vinfo || at_loop == loop))
4347 aptr = aggr_ptr_init;
4348 else
4350 /* The step of the aggregate pointer is the type size. */
4351 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4352 /* One exception to the above is when the scalar step of the load in
4353 LOOP is zero. In this case the step here is also zero. */
4354 if (*inv_p)
4355 iv_step = size_zero_node;
4356 else if (tree_int_cst_sgn (step) == -1)
4357 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4359 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4361 create_iv (aggr_ptr_init,
4362 fold_convert (aggr_ptr_type, iv_step),
4363 aggr_ptr, loop, &incr_gsi, insert_after,
4364 &indx_before_incr, &indx_after_incr);
4365 incr = gsi_stmt (incr_gsi);
4366 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4368 /* Copy the points-to information if it exists. */
4369 if (DR_PTR_INFO (dr))
4371 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4372 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4374 if (ptr_incr)
4375 *ptr_incr = incr;
4377 aptr = indx_before_incr;
4380 if (!nested_in_vect_loop || only_init)
4381 return aptr;
4384 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4385 nested in LOOP, if it exists. */
4387 gcc_assert (nested_in_vect_loop);
4388 if (!only_init)
4390 standard_iv_increment_position (containing_loop, &incr_gsi,
4391 &insert_after);
4392 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4393 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4394 &indx_after_incr);
4395 incr = gsi_stmt (incr_gsi);
4396 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4398 /* Copy the points-to information if it exists. */
4399 if (DR_PTR_INFO (dr))
4401 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4402 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4404 if (ptr_incr)
4405 *ptr_incr = incr;
4407 return indx_before_incr;
4409 else
4410 gcc_unreachable ();
4414 /* Function bump_vector_ptr
4416 Increment a pointer (to a vector type) by vector-size. If requested,
4417 i.e. if PTR-INCR is given, then also connect the new increment stmt
4418 to the existing def-use update-chain of the pointer, by modifying
4419 the PTR_INCR as illustrated below:
4421 The pointer def-use update-chain before this function:
4422 DATAREF_PTR = phi (p_0, p_2)
4423 ....
4424 PTR_INCR: p_2 = DATAREF_PTR + step
4426 The pointer def-use update-chain after this function:
4427 DATAREF_PTR = phi (p_0, p_2)
4428 ....
4429 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4430 ....
4431 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4433 Input:
4434 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4435 in the loop.
4436 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4437 the loop. The increment amount across iterations is expected
4438 to be vector_size.
4439 BSI - location where the new update stmt is to be placed.
4440 STMT - the original scalar memory-access stmt that is being vectorized.
4441 BUMP - optional. The offset by which to bump the pointer. If not given,
4442 the offset is assumed to be vector_size.
4444 Output: Return NEW_DATAREF_PTR as illustrated above.
4448 tree
4449 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4450 gimple *stmt, tree bump)
4452 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4453 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4454 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4455 tree update = TYPE_SIZE_UNIT (vectype);
4456 gassign *incr_stmt;
4457 ssa_op_iter iter;
4458 use_operand_p use_p;
4459 tree new_dataref_ptr;
4461 if (bump)
4462 update = bump;
4464 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4465 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4466 else
4467 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4468 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4469 dataref_ptr, update);
4470 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4472 /* Copy the points-to information if it exists. */
4473 if (DR_PTR_INFO (dr))
4475 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4476 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4479 if (!ptr_incr)
4480 return new_dataref_ptr;
4482 /* Update the vector-pointer's cross-iteration increment. */
4483 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4485 tree use = USE_FROM_PTR (use_p);
4487 if (use == dataref_ptr)
4488 SET_USE (use_p, new_dataref_ptr);
4489 else
4490 gcc_assert (tree_int_cst_compare (use, update) == 0);
4493 return new_dataref_ptr;
4497 /* Function vect_create_destination_var.
4499 Create a new temporary of type VECTYPE. */
4501 tree
4502 vect_create_destination_var (tree scalar_dest, tree vectype)
4504 tree vec_dest;
4505 const char *name;
4506 char *new_name;
4507 tree type;
4508 enum vect_var_kind kind;
4510 kind = vectype
4511 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4512 ? vect_mask_var
4513 : vect_simple_var
4514 : vect_scalar_var;
4515 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4517 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4519 name = get_name (scalar_dest);
4520 if (name)
4521 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4522 else
4523 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4524 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4525 free (new_name);
4527 return vec_dest;
4530 /* Function vect_grouped_store_supported.
4532 Returns TRUE if interleave high and interleave low permutations
4533 are supported, and FALSE otherwise. */
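/* Editorial sketch (not in the original source): for the power-of-two case
   with a vector mode of nelt == 4, the two selectors checked below are
     interleave high: { 0, 4, 1, 5 }
     interleave low : { 2, 6, 3, 7 }
   where indices 0..3 select from the first vector of the concatenation and
   4..7 from the second.  */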
4535 bool
4536 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4538 machine_mode mode = TYPE_MODE (vectype);
4540 /* vect_permute_store_chain requires the group size to be equal to 3 or
4541 be a power of two. */
4542 if (count != 3 && exact_log2 (count) == -1)
4544 if (dump_enabled_p ())
4545 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4546 "the size of the group of accesses"
4547 " is not a power of 2 or not equal to 3\n");
4548 return false;
4551 /* Check that the permutation is supported. */
4552 if (VECTOR_MODE_P (mode))
4554 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4555 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4557 if (count == 3)
4559 unsigned int j0 = 0, j1 = 0, j2 = 0;
4560 unsigned int i, j;
4562 for (j = 0; j < 3; j++)
4564 int nelt0 = ((3 - j) * nelt) % 3;
4565 int nelt1 = ((3 - j) * nelt + 1) % 3;
4566 int nelt2 = ((3 - j) * nelt + 2) % 3;
4567 for (i = 0; i < nelt; i++)
4569 if (3 * i + nelt0 < nelt)
4570 sel[3 * i + nelt0] = j0++;
4571 if (3 * i + nelt1 < nelt)
4572 sel[3 * i + nelt1] = nelt + j1++;
4573 if (3 * i + nelt2 < nelt)
4574 sel[3 * i + nelt2] = 0;
4576 if (!can_vec_perm_p (mode, false, sel))
4578 if (dump_enabled_p ())
4579 dump_printf (MSG_MISSED_OPTIMIZATION,
4580 "permutation op not supported by target.\n");
4581 return false;
4584 for (i = 0; i < nelt; i++)
4586 if (3 * i + nelt0 < nelt)
4587 sel[3 * i + nelt0] = 3 * i + nelt0;
4588 if (3 * i + nelt1 < nelt)
4589 sel[3 * i + nelt1] = 3 * i + nelt1;
4590 if (3 * i + nelt2 < nelt)
4591 sel[3 * i + nelt2] = nelt + j2++;
4593 if (!can_vec_perm_p (mode, false, sel))
4595 if (dump_enabled_p ())
4596 dump_printf (MSG_MISSED_OPTIMIZATION,
4597 "permutation op not supported by target.\n");
4598 return false;
4601 return true;
4603 else
4605 /* If length is not equal to 3 then only a power of 2 is supported. */
4606 gcc_assert (exact_log2 (count) != -1);
4608 for (i = 0; i < nelt / 2; i++)
4610 sel[i * 2] = i;
4611 sel[i * 2 + 1] = i + nelt;
4613 if (can_vec_perm_p (mode, false, sel))
4615 for (i = 0; i < nelt; i++)
4616 sel[i] += nelt / 2;
4617 if (can_vec_perm_p (mode, false, sel))
4618 return true;
4623 if (dump_enabled_p ())
4624 dump_printf (MSG_MISSED_OPTIMIZATION,
4625 "permutation op not supported by target.\n");
4626 return false;
4630 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4631 type VECTYPE. */
4633 bool
4634 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4636 return vect_lanes_optab_supported_p ("vec_store_lanes",
4637 vec_store_lanes_optab,
4638 vectype, count);
4642 /* Function vect_permute_store_chain.
4644 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4645 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4646 the data correctly for the stores. Return the final references for stores
4647 in RESULT_CHAIN.
4649 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4650 The input is 4 vectors each containing 8 elements. We assign a number to
4651 each element, the input sequence is:
4653 1st vec: 0 1 2 3 4 5 6 7
4654 2nd vec: 8 9 10 11 12 13 14 15
4655 3rd vec: 16 17 18 19 20 21 22 23
4656 4th vec: 24 25 26 27 28 29 30 31
4658 The output sequence should be:
4660 1st vec: 0 8 16 24 1 9 17 25
4661 2nd vec: 2 10 18 26 3 11 19 27
4662 3rd vec: 4 12 20 28 5 13 21 29
4663 4th vec: 6 14 22 30 7 15 23 31
4665 i.e., we interleave the contents of the four vectors in their order.
4667 We use interleave_high/low instructions to create such output. The input of
4668 each interleave_high/low operation is two vectors:
4669 1st vec 2nd vec
4670 0 1 2 3 4 5 6 7
4671 the even elements of the result vector are obtained left-to-right from the
4672 high/low elements of the first vector. The odd elements of the result are
4673 obtained left-to-right from the high/low elements of the second vector.
4674 The output of interleave_high will be: 0 4 1 5
4675 and of interleave_low: 2 6 3 7
4678 The permutation is done in log LENGTH stages. In each stage interleave_high
4679 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4680 where the first argument is taken from the first half of DR_CHAIN and the
4681 second argument from its second half.
4682 In our example,
4684 I1: interleave_high (1st vec, 3rd vec)
4685 I2: interleave_low (1st vec, 3rd vec)
4686 I3: interleave_high (2nd vec, 4th vec)
4687 I4: interleave_low (2nd vec, 4th vec)
4689 The output for the first stage is:
4691 I1: 0 16 1 17 2 18 3 19
4692 I2: 4 20 5 21 6 22 7 23
4693 I3: 8 24 9 25 10 26 11 27
4694 I4: 12 28 13 29 14 30 15 31
4696 The output of the second stage, i.e. the final result is:
4698 I1: 0 8 16 24 1 9 17 25
4699 I2: 2 10 18 26 3 11 19 27
4700 I3: 4 12 20 28 5 13 21 29
4701 I4: 6 14 22 30 7 15 23 31. */
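/* Editorial note (not in the original source): with nelt == 8 as in the
   example above, the power-of-two branch below uses the VEC_PERM selectors
     perm_mask_high = { 0, 8, 1, 9, 2, 10, 3, 11 }
     perm_mask_low  = { 4, 12, 5, 13, 6, 14, 7, 15 }
   applied to the concatenation of each pair of vectors.  */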
4703 void
4704 vect_permute_store_chain (vec<tree> dr_chain,
4705 unsigned int length,
4706 gimple *stmt,
4707 gimple_stmt_iterator *gsi,
4708 vec<tree> *result_chain)
4710 tree vect1, vect2, high, low;
4711 gimple *perm_stmt;
4712 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4713 tree perm_mask_low, perm_mask_high;
4714 tree data_ref;
4715 tree perm3_mask_low, perm3_mask_high;
4716 unsigned int i, n, log_length = exact_log2 (length);
4717 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4718 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4720 result_chain->quick_grow (length);
4721 memcpy (result_chain->address (), dr_chain.address (),
4722 length * sizeof (tree));
4724 if (length == 3)
4726 unsigned int j0 = 0, j1 = 0, j2 = 0;
4728 for (j = 0; j < 3; j++)
4730 int nelt0 = ((3 - j) * nelt) % 3;
4731 int nelt1 = ((3 - j) * nelt + 1) % 3;
4732 int nelt2 = ((3 - j) * nelt + 2) % 3;
4734 for (i = 0; i < nelt; i++)
4736 if (3 * i + nelt0 < nelt)
4737 sel[3 * i + nelt0] = j0++;
4738 if (3 * i + nelt1 < nelt)
4739 sel[3 * i + nelt1] = nelt + j1++;
4740 if (3 * i + nelt2 < nelt)
4741 sel[3 * i + nelt2] = 0;
4743 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4745 for (i = 0; i < nelt; i++)
4747 if (3 * i + nelt0 < nelt)
4748 sel[3 * i + nelt0] = 3 * i + nelt0;
4749 if (3 * i + nelt1 < nelt)
4750 sel[3 * i + nelt1] = 3 * i + nelt1;
4751 if (3 * i + nelt2 < nelt)
4752 sel[3 * i + nelt2] = nelt + j2++;
4754 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4756 vect1 = dr_chain[0];
4757 vect2 = dr_chain[1];
4759 /* Create interleaving stmt:
4760 low = VEC_PERM_EXPR <vect1, vect2,
4761 {j, nelt, *, j + 1, nelt + j + 1, *,
4762 j + 2, nelt + j + 2, *, ...}> */
4763 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4764 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4765 vect2, perm3_mask_low);
4766 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4768 vect1 = data_ref;
4769 vect2 = dr_chain[2];
4770 /* Create interleaving stmt:
4771 low = VEC_PERM_EXPR <vect1, vect2,
4772 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4773 6, 7, nelt + j + 2, ...}> */
4774 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4775 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4776 vect2, perm3_mask_high);
4777 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4778 (*result_chain)[j] = data_ref;
4781 else
4783 /* If length is not equal to 3 then only a power of 2 is supported. */
4784 gcc_assert (exact_log2 (length) != -1);
4786 for (i = 0, n = nelt / 2; i < n; i++)
4788 sel[i * 2] = i;
4789 sel[i * 2 + 1] = i + nelt;
4791 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4793 for (i = 0; i < nelt; i++)
4794 sel[i] += nelt / 2;
4795 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4797 for (i = 0, n = log_length; i < n; i++)
4799 for (j = 0; j < length/2; j++)
4801 vect1 = dr_chain[j];
4802 vect2 = dr_chain[j+length/2];
4804 /* Create interleaving stmt:
4805 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4806 ...}> */
4807 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4808 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4809 vect2, perm_mask_high);
4810 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4811 (*result_chain)[2*j] = high;
4813 /* Create interleaving stmt:
4814 low = VEC_PERM_EXPR <vect1, vect2,
4815 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4816 ...}> */
4817 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4818 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4819 vect2, perm_mask_low);
4820 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4821 (*result_chain)[2*j+1] = low;
4823 memcpy (dr_chain.address (), result_chain->address (),
4824 length * sizeof (tree));
4829 /* Function vect_setup_realignment
4831 This function is called when vectorizing an unaligned load using
4832 the dr_explicit_realign[_optimized] scheme.
4833 This function generates the following code at the loop prolog:
4835 p = initial_addr;
4836 x msq_init = *(floor(p)); # prolog load
4837 realignment_token = call target_builtin;
4838 loop:
4839 x msq = phi (msq_init, ---)
4841 The stmts marked with x are generated only for the case of
4842 dr_explicit_realign_optimized.
4844 The code above sets up a new (vector) pointer, pointing to the first
4845 location accessed by STMT, and a "floor-aligned" load using that pointer.
4846 It also generates code to compute the "realignment-token" (if the relevant
4847 target hook was defined), and creates a phi-node at the loop-header bb
4848 whose arguments are the result of the prolog-load (created by this
4849 function) and the result of a load that takes place in the loop (to be
4850 created by the caller to this function).
4852 For the case of dr_explicit_realign_optimized:
4853 The caller to this function uses the phi-result (msq) to create the
4854 realignment code inside the loop, and sets up the missing phi argument,
4855 as follows:
4856 loop:
4857 msq = phi (msq_init, lsq)
4858 lsq = *(floor(p')); # load in loop
4859 result = realign_load (msq, lsq, realignment_token);
4861 For the case of dr_explicit_realign:
4862 loop:
4863 msq = *(floor(p)); # load in loop
4864 p' = p + (VS-1);
4865 lsq = *(floor(p')); # load in loop
4866 result = realign_load (msq, lsq, realignment_token);
4868 Input:
4869 STMT - (scalar) load stmt to be vectorized. This load accesses
4870 a memory location that may be unaligned.
4871 BSI - place where new code is to be inserted.
4872 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4873 is used.
4875 Output:
4876 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4877 target hook, if defined.
4878 Return value - the result of the loop-header phi node. */
4880 tree
4881 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4882 tree *realignment_token,
4883 enum dr_alignment_support alignment_support_scheme,
4884 tree init_addr,
4885 struct loop **at_loop)
4887 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4888 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4889 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4890 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4891 struct loop *loop = NULL;
4892 edge pe = NULL;
4893 tree scalar_dest = gimple_assign_lhs (stmt);
4894 tree vec_dest;
4895 gimple *inc;
4896 tree ptr;
4897 tree data_ref;
4898 basic_block new_bb;
4899 tree msq_init = NULL_TREE;
4900 tree new_temp;
4901 gphi *phi_stmt;
4902 tree msq = NULL_TREE;
4903 gimple_seq stmts = NULL;
4904 bool inv_p;
4905 bool compute_in_loop = false;
4906 bool nested_in_vect_loop = false;
4907 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4908 struct loop *loop_for_initial_load = NULL;
4910 if (loop_vinfo)
4912 loop = LOOP_VINFO_LOOP (loop_vinfo);
4913 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4916 gcc_assert (alignment_support_scheme == dr_explicit_realign
4917 || alignment_support_scheme == dr_explicit_realign_optimized);
4919 /* We need to generate three things:
4920 1. the misalignment computation
4921 2. the extra vector load (for the optimized realignment scheme).
4922 3. the phi node for the two vectors from which the realignment is
4923 done (for the optimized realignment scheme). */
4925 /* 1. Determine where to generate the misalignment computation.
4927 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4928 calculation will be generated by this function, outside the loop (in the
4929 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4930 caller, inside the loop.
4932 Background: If the misalignment remains fixed throughout the iterations of
4933 the loop, then both realignment schemes are applicable, and also the
4934 misalignment computation can be done outside LOOP. This is because we are
4935 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4936 are a multiple of VS (the Vector Size), and therefore the misalignment in
4937 different vectorized LOOP iterations is always the same.
4938 The problem arises only if the memory access is in an inner-loop nested
4939 inside LOOP, which is now being vectorized using outer-loop vectorization.
4940 This is the only case when the misalignment of the memory access may not
4941 remain fixed throughout the iterations of the inner-loop (as explained in
4942 detail in vect_supportable_dr_alignment). In this case, not only is the
4943 optimized realignment scheme not applicable, but also the misalignment
4944 computation (and generation of the realignment token that is passed to
4945 REALIGN_LOAD) have to be done inside the loop.
4947 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4948 or not, which in turn determines if the misalignment is computed inside
4949 the inner-loop, or outside LOOP. */
4951 if (init_addr != NULL_TREE || !loop_vinfo)
4953 compute_in_loop = true;
4954 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4958 /* 2. Determine where to generate the extra vector load.
4960 For the optimized realignment scheme, instead of generating two vector
4961 loads in each iteration, we generate a single extra vector load in the
4962 preheader of the loop, and in each iteration reuse the result of the
4963 vector load from the previous iteration. In case the memory access is in
4964 an inner-loop nested inside LOOP, which is now being vectorized using
4965 outer-loop vectorization, we need to determine whether this initial vector
4966 load should be generated at the preheader of the inner-loop, or can be
4967 generated at the preheader of LOOP. If the memory access has no evolution
4968 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4969 to be generated inside LOOP (in the preheader of the inner-loop). */
4971 if (nested_in_vect_loop)
4973 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4974 bool invariant_in_outerloop =
4975 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4976 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4978 else
4979 loop_for_initial_load = loop;
4980 if (at_loop)
4981 *at_loop = loop_for_initial_load;
4983 if (loop_for_initial_load)
4984 pe = loop_preheader_edge (loop_for_initial_load);
4986 /* 3. For the case of the optimized realignment, create the first vector
4987 load at the loop preheader. */
4989 if (alignment_support_scheme == dr_explicit_realign_optimized)
4991 /* Create msq_init = *(floor(p1)) in the loop preheader */
4992 gassign *new_stmt;
4994 gcc_assert (!compute_in_loop);
4995 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4996 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4997 NULL_TREE, &init_addr, NULL, &inc,
4998 true, &inv_p);
4999 if (TREE_CODE (ptr) == SSA_NAME)
5000 new_temp = copy_ssa_name (ptr);
5001 else
5002 new_temp = make_ssa_name (TREE_TYPE (ptr));
5003 new_stmt = gimple_build_assign
5004 (new_temp, BIT_AND_EXPR, ptr,
5005 build_int_cst (TREE_TYPE (ptr),
5006 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
5007 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5008 gcc_assert (!new_bb);
5009 data_ref
5010 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5011 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5012 new_stmt = gimple_build_assign (vec_dest, data_ref);
5013 new_temp = make_ssa_name (vec_dest, new_stmt);
5014 gimple_assign_set_lhs (new_stmt, new_temp);
5015 if (pe)
5017 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5018 gcc_assert (!new_bb);
5020 else
5021 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5023 msq_init = gimple_assign_lhs (new_stmt);
5026 /* 4. Create realignment token using a target builtin, if available.
5027 It is done either inside the containing loop, or before LOOP (as
5028 determined above). */
5030 if (targetm.vectorize.builtin_mask_for_load)
5032 gcall *new_stmt;
5033 tree builtin_decl;
5035 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5036 if (!init_addr)
5038 /* Generate the INIT_ADDR computation outside LOOP. */
5039 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5040 NULL_TREE, loop);
5041 if (loop)
5043 pe = loop_preheader_edge (loop);
5044 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5045 gcc_assert (!new_bb);
5047 else
5048 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5051 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5052 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5053 vec_dest =
5054 vect_create_destination_var (scalar_dest,
5055 gimple_call_return_type (new_stmt));
5056 new_temp = make_ssa_name (vec_dest, new_stmt);
5057 gimple_call_set_lhs (new_stmt, new_temp);
5059 if (compute_in_loop)
5060 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5061 else
5063 /* Generate the misalignment computation outside LOOP. */
5064 pe = loop_preheader_edge (loop);
5065 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5066 gcc_assert (!new_bb);
5069 *realignment_token = gimple_call_lhs (new_stmt);
5071 /* The result of the CALL_EXPR to this builtin is determined from
5072 the value of the parameter and no global variables are touched
5073 which makes the builtin a "const" function. Requiring the
5074 builtin to have the "const" attribute makes it unnecessary
5075 to call mark_call_clobbered. */
5076 gcc_assert (TREE_READONLY (builtin_decl));
5079 if (alignment_support_scheme == dr_explicit_realign)
5080 return msq;
5082 gcc_assert (!compute_in_loop);
5083 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5086 /* 5. Create msq = phi <msq_init, lsq> in loop */
5088 pe = loop_preheader_edge (containing_loop);
5089 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5090 msq = make_ssa_name (vec_dest);
5091 phi_stmt = create_phi_node (msq, containing_loop->header);
5092 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5094 return msq;
5098 /* Function vect_grouped_load_supported.
5100 Returns TRUE if even and odd permutations are supported,
5101 and FALSE otherwise. */
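/* Editorial sketch (not in the original source): for the power-of-two case
   with a vector mode of nelt == 4, the selectors checked below are
     extract even: { 0, 2, 4, 6 }
     extract odd : { 1, 3, 5, 7 }
   taken from the concatenation of the two input vectors.  */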
5103 bool
5104 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
5106 machine_mode mode = TYPE_MODE (vectype);
5108 /* vect_permute_load_chain requires the group size to be equal to 3 or
5109 be a power of two. */
5110 if (count != 3 && exact_log2 (count) == -1)
5112 if (dump_enabled_p ())
5113 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5114 "the size of the group of accesses"
5115 " is not a power of 2 or not equal to 3\n");
5116 return false;
5119 /* Check that the permutation is supported. */
5120 if (VECTOR_MODE_P (mode))
5122 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5123 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5125 if (count == 3)
5127 unsigned int k;
5128 for (k = 0; k < 3; k++)
5130 for (i = 0; i < nelt; i++)
5131 if (3 * i + k < 2 * nelt)
5132 sel[i] = 3 * i + k;
5133 else
5134 sel[i] = 0;
5135 if (!can_vec_perm_p (mode, false, sel))
5137 if (dump_enabled_p ())
5138 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5139 "shuffle of 3 loads is not supported by"
5140 " target\n");
5141 return false;
5143 for (i = 0, j = 0; i < nelt; i++)
5144 if (3 * i + k < 2 * nelt)
5145 sel[i] = i;
5146 else
5147 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5148 if (!can_vec_perm_p (mode, false, sel))
5150 if (dump_enabled_p ())
5151 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5152 "shuffle of 3 loads is not supported by"
5153 " target\n");
5154 return false;
5157 return true;
5159 else
5161 /* If length is not equal to 3 then only a power of 2 is supported. */
5162 gcc_assert (exact_log2 (count) != -1);
5163 for (i = 0; i < nelt; i++)
5164 sel[i] = i * 2;
5165 if (can_vec_perm_p (mode, false, sel))
5167 for (i = 0; i < nelt; i++)
5168 sel[i] = i * 2 + 1;
5169 if (can_vec_perm_p (mode, false, sel))
5170 return true;
5175 if (dump_enabled_p ())
5176 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5177 "extract even/odd not supported by target\n");
5178 return false;
5181 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5182 type VECTYPE. */
5184 bool
5185 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5187 return vect_lanes_optab_supported_p ("vec_load_lanes",
5188 vec_load_lanes_optab,
5189 vectype, count);
5192 /* Function vect_permute_load_chain.
5194 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5195 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5196 the input data correctly. Return the final references for loads in
5197 RESULT_CHAIN.
5199 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5200 The input is 4 vectors each containing 8 elements. We assign a number to each
5201 element, the input sequence is:
5203 1st vec: 0 1 2 3 4 5 6 7
5204 2nd vec: 8 9 10 11 12 13 14 15
5205 3rd vec: 16 17 18 19 20 21 22 23
5206 4th vec: 24 25 26 27 28 29 30 31
5208 The output sequence should be:
5210 1st vec: 0 4 8 12 16 20 24 28
5211 2nd vec: 1 5 9 13 17 21 25 29
5212 3rd vec: 2 6 10 14 18 22 26 30
5213 4th vec: 3 7 11 15 19 23 27 31
5215 i.e., the first output vector should contain the first elements of each
5216 interleaving group, etc.
5218 We use extract_even/odd instructions to create such output. The input of
5219 each extract_even/odd operation is two vectors
5220 1st vec 2nd vec
5221 0 1 2 3 4 5 6 7
5223 and the output is the vector of extracted even/odd elements. The output of
5224 extract_even will be: 0 2 4 6
5225 and of extract_odd: 1 3 5 7
5228 The permutation is done in log LENGTH stages. In each stage extract_even
5229 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5230 their order. In our example,
5232 E1: extract_even (1st vec, 2nd vec)
5233 E2: extract_odd (1st vec, 2nd vec)
5234 E3: extract_even (3rd vec, 4th vec)
5235 E4: extract_odd (3rd vec, 4th vec)
5237 The output for the first stage will be:
5239 E1: 0 2 4 6 8 10 12 14
5240 E2: 1 3 5 7 9 11 13 15
5241 E3: 16 18 20 22 24 26 28 30
5242 E4: 17 19 21 23 25 27 29 31
5244 In order to proceed and create the correct sequence for the next stage (or
5245 for the correct output, if the second stage is the last one, as in our
5246 example), we first put the output of extract_even operation and then the
5247 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5248 The input for the second stage is:
5250 1st vec (E1): 0 2 4 6 8 10 12 14
5251 2nd vec (E3): 16 18 20 22 24 26 28 30
5252 3rd vec (E2): 1 3 5 7 9 11 13 15
5253 4th vec (E4): 17 19 21 23 25 27 29 31
5255 The output of the second stage:
5257 E1: 0 4 8 12 16 20 24 28
5258 E2: 2 6 10 14 18 22 26 30
5259 E3: 1 5 9 13 17 21 25 29
5260 E4: 3 7 11 15 19 23 27 31
5262 And RESULT_CHAIN after reordering:
5264 1st vec (E1): 0 4 8 12 16 20 24 28
5265 2nd vec (E3): 1 5 9 13 17 21 25 29
5266 3rd vec (E2): 2 6 10 14 18 22 26 30
5267 4th vec (E4): 3 7 11 15 19 23 27 31. */
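/* Editorial note (not in the original source): with nelt == 8 as in the
   example above, the power-of-two branch below uses
     perm_mask_even = { 0, 2, 4, 6, 8, 10, 12, 14 }
     perm_mask_odd  = { 1, 3, 5, 7, 9, 11, 13, 15 }
   as VEC_PERM selectors over the concatenation of each pair of vectors.  */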
5269 static void
5270 vect_permute_load_chain (vec<tree> dr_chain,
5271 unsigned int length,
5272 gimple *stmt,
5273 gimple_stmt_iterator *gsi,
5274 vec<tree> *result_chain)
5276 tree data_ref, first_vect, second_vect;
5277 tree perm_mask_even, perm_mask_odd;
5278 tree perm3_mask_low, perm3_mask_high;
5279 gimple *perm_stmt;
5280 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5281 unsigned int i, j, log_length = exact_log2 (length);
5282 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5283 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5285 result_chain->quick_grow (length);
5286 memcpy (result_chain->address (), dr_chain.address (),
5287 length * sizeof (tree));
5289 if (length == 3)
5291 unsigned int k;
5293 for (k = 0; k < 3; k++)
5295 for (i = 0; i < nelt; i++)
5296 if (3 * i + k < 2 * nelt)
5297 sel[i] = 3 * i + k;
5298 else
5299 sel[i] = 0;
5300 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5302 for (i = 0, j = 0; i < nelt; i++)
5303 if (3 * i + k < 2 * nelt)
5304 sel[i] = i;
5305 else
5306 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5308 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5310 first_vect = dr_chain[0];
5311 second_vect = dr_chain[1];
5313 /* Create interleaving stmt (low part of):
5314 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5315 ...}> */
5316 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5317 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5318 second_vect, perm3_mask_low);
5319 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5321 /* Create interleaving stmt (high part of):
5322 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5323 ...}> */
5324 first_vect = data_ref;
5325 second_vect = dr_chain[2];
5326 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5327 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5328 second_vect, perm3_mask_high);
5329 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5330 (*result_chain)[k] = data_ref;
5333 else
5335 /* If length is not equal to 3 then only a power of 2 is supported. */
5336 gcc_assert (exact_log2 (length) != -1);
5338 for (i = 0; i < nelt; ++i)
5339 sel[i] = i * 2;
5340 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5342 for (i = 0; i < nelt; ++i)
5343 sel[i] = i * 2 + 1;
5344 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5346 for (i = 0; i < log_length; i++)
5348 for (j = 0; j < length; j += 2)
5350 first_vect = dr_chain[j];
5351 second_vect = dr_chain[j+1];
5353 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5354 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5355 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5356 first_vect, second_vect,
5357 perm_mask_even);
5358 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5359 (*result_chain)[j/2] = data_ref;
5361 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5362 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5363 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5364 first_vect, second_vect,
5365 perm_mask_odd);
5366 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5367 (*result_chain)[j/2+length/2] = data_ref;
5369 memcpy (dr_chain.address (), result_chain->address (),
5370 length * sizeof (tree));
5375 /* Function vect_shift_permute_load_chain.
5377 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5378 sequence of stmts to reorder the input data accordingly.
5379 Return the final references for loads in RESULT_CHAIN.
5380 Return true if successful, false otherwise.
5382 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5383 The input is 3 vectors each containing 8 elements. We assign a
5384 number to each element, the input sequence is:
5386 1st vec: 0 1 2 3 4 5 6 7
5387 2nd vec: 8 9 10 11 12 13 14 15
5388 3rd vec: 16 17 18 19 20 21 22 23
5390 The output sequence should be:
5392 1st vec: 0 3 6 9 12 15 18 21
5393 2nd vec: 1 4 7 10 13 16 19 22
5394 3rd vec: 2 5 8 11 14 17 20 23
5396 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5398 First we shuffle all 3 vectors to get correct elements order:
5400 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5401 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5402 3rd vec: (16 19 22) (17 20 23) (18 21)
5404 Next we unite and shift vector 3 times:
5406 1st step:
5407 shift right by 6 the concatenation of:
5408 "1st vec" and "2nd vec"
5409 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5410 "2nd vec" and "3rd vec"
5411 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5412 "3rd vec" and "1st vec"
5413 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5414 | New vectors |
5416 So that now new vectors are:
5418 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5419 2nd vec: (10 13) (16 19 22) (17 20 23)
5420 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5422 2nd step:
5423 shift right by 5 the concatenation of:
5424 "1st vec" and "3rd vec"
5425 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5426 "2nd vec" and "1st vec"
5427 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5428 "3rd vec" and "2nd vec"
5429 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5430 | New vectors |
5432 So that now new vectors are:
5434 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5435 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5436 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5438 3rd step:
5439 shift right by 5 the concatenation of:
5440 "1st vec" and "1st vec"
5441 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5442 shift right by 3 the concatenation of:
5443 "2nd vec" and "2nd vec"
5444 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5445 | New vectors |
5447 So that now all vectors are READY:
5448 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5449 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5450 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5452 This algorithm is faster than the one in vect_permute_load_chain if:
5453 1. "shift of a concatenation" is faster than a general permutation.
5454 This is usually so.
5455 2. The TARGET machine can't execute vector instructions in parallel.
5456 This is because each step of the algorithm depends on the previous one.
5457 The algorithm in vect_permute_load_chain is much more parallel.
5459 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
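/* As an illustrative sketch (an assumed example, not taken from the
   description above): a load chain of LENGTH 3 as in the example typically
   originates from a scalar loop that reads three interleaved fields with
   stride 3, e.g.

     for (i = 0; i < N; i++)
       {
         x[i] = a[3 * i];       // elements 0, 3, 6, ...
         y[i] = a[3 * i + 1];   // elements 1, 4, 7, ...
         z[i] = a[3 * i + 2];   // elements 2, 5, 8, ...
       }

   After the permutation each output vector holds the elements of a single
   field, which is exactly the order the vectorized statements consume.  */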
5462 static bool
5463 vect_shift_permute_load_chain (vec<tree> dr_chain,
5464 unsigned int length,
5465 gimple *stmt,
5466 gimple_stmt_iterator *gsi,
5467 vec<tree> *result_chain)
5469 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5470 tree perm2_mask1, perm2_mask2, perm3_mask;
5471 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5472 gimple *perm_stmt;
5474 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5475 unsigned int i;
5476 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5477 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5478 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5479 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5481 result_chain->quick_grow (length);
5482 memcpy (result_chain->address (), dr_chain.address (),
5483 length * sizeof (tree));
5485 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5487 unsigned int j, log_length = exact_log2 (length);
5488 for (i = 0; i < nelt / 2; ++i)
5489 sel[i] = i * 2;
5490 for (i = 0; i < nelt / 2; ++i)
5491 sel[nelt / 2 + i] = i * 2 + 1;
5492 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5494 if (dump_enabled_p ())
5495 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5496 "shuffle of 2 fields structure is not \
5497 supported by target\n");
5498 return false;
5500 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5502 for (i = 0; i < nelt / 2; ++i)
5503 sel[i] = i * 2 + 1;
5504 for (i = 0; i < nelt / 2; ++i)
5505 sel[nelt / 2 + i] = i * 2;
5506 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5508 if (dump_enabled_p ())
5509 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5510 "shuffle of 2 fields structure is not \
5511 supported by target\n");
5512 return false;
5514 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5516 /* Generating permutation constant to shift all elements.
5517 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5518 for (i = 0; i < nelt; i++)
5519 sel[i] = nelt / 2 + i;
5520 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5522 if (dump_enabled_p ())
5523 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5524 "shift permutation is not supported by target\n");
5525 return false;
5527 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5529 /* Generating permutation constant to select vector from 2.
5530 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5531 for (i = 0; i < nelt / 2; i++)
5532 sel[i] = i;
5533 for (i = nelt / 2; i < nelt; i++)
5534 sel[i] = nelt + i;
5535 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5537 if (dump_enabled_p ())
5538 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5539 "select is not supported by target\n");
5540 return false;
5542 select_mask = vect_gen_perm_mask_checked (vectype, sel);
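/* Summary of the statements generated in the loop below: perm2_mask1 reorders
   FIRST_VECT into its even-indexed elements followed by its odd-indexed ones,
   and perm2_mask2 reorders SECOND_VECT into its odd-indexed elements followed
   by its even-indexed ones.  shift1_mask then gathers the two odd halves into
   (*result_chain)[j/2 + length/2], and select_mask gathers the two even halves
   into (*result_chain)[j/2].  This is the same even/odd split as in
   vect_permute_load_chain, expressed with a shift and a select.  */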
5544 for (i = 0; i < log_length; i++)
5546 for (j = 0; j < length; j += 2)
5548 first_vect = dr_chain[j];
5549 second_vect = dr_chain[j + 1];
5551 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5552 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5553 first_vect, first_vect,
5554 perm2_mask1);
5555 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5556 vect[0] = data_ref;
5558 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5559 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5560 second_vect, second_vect,
5561 perm2_mask2);
5562 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5563 vect[1] = data_ref;
5565 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5566 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5567 vect[0], vect[1], shift1_mask);
5568 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5569 (*result_chain)[j/2 + length/2] = data_ref;
5571 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5572 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5573 vect[0], vect[1], select_mask);
5574 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5575 (*result_chain)[j/2] = data_ref;
5577 memcpy (dr_chain.address (), result_chain->address (),
5578 length * sizeof (tree));
5580 return true;
5582 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5584 unsigned int k = 0, l = 0;
5586 /* Generating permutation constant to get all elements in the right order.
5587 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5588 for (i = 0; i < nelt; i++)
5590 if (3 * k + (l % 3) >= nelt)
5592 k = 0;
5593 l += (3 - (nelt % 3));
5595 sel[i] = 3 * k + (l % 3);
5596 k++;
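/* Illustrative trace for nelt == 8: k counts 0, 1, 2, ...; whenever
   3 * k + l % 3 would reach nelt, k is reset and l is advanced by
   3 - (nelt % 3), which is 1 for nelt == 8, so the loop above yields
   sel = {0 3 6 1 4 7 2 5}, matching the comment before it.  */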
5598 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5600 if (dump_enabled_p ())
5601 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5602 "shuffle of 3 fields structure is not \
5603 supported by target\n");
5604 return false;
5606 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5608 /* Generating permutation constant to shift all elements.
5609 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5610 for (i = 0; i < nelt; i++)
5611 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5612 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5614 if (dump_enabled_p ())
5615 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5616 "shift permutation is not supported by target\n");
5617 return false;
5619 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5621 /* Generating permutation constant to shift all elements.
5622 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5623 for (i = 0; i < nelt; i++)
5624 sel[i] = 2 * (nelt / 3) + 1 + i;
5625 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5627 if (dump_enabled_p ())
5628 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5629 "shift permutation is not supported by target\n");
5630 return false;
5632 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5634 /* Generating permutation constant to shift all elements.
5635 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5636 for (i = 0; i < nelt; i++)
5637 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5638 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5640 if (dump_enabled_p ())
5641 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5642 "shift permutation is not supported by target\n");
5643 return false;
5645 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5647 /* Generating permutation constant to shift all elements.
5648 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5649 for (i = 0; i < nelt; i++)
5650 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5651 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5653 if (dump_enabled_p ())
5654 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5655 "shift permutation is not supported by target\n");
5656 return false;
5658 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
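/* Note on the index arithmetic used when storing the final results below:
   for nelt == 8, 3 - (nelt % 3) evaluates to 1 and nelt % 3 to 2, so the
   three output vectors are written to (*result_chain)[1], (*result_chain)[2]
   and (*result_chain)[0] in that order.  */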
5660 for (k = 0; k < 3; k++)
5662 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5663 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5664 dr_chain[k], dr_chain[k],
5665 perm3_mask);
5666 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5667 vect[k] = data_ref;
5670 for (k = 0; k < 3; k++)
5672 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5673 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5674 vect[k % 3], vect[(k + 1) % 3],
5675 shift1_mask);
5676 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5677 vect_shift[k] = data_ref;
5680 for (k = 0; k < 3; k++)
5682 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5683 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5684 vect_shift[(4 - k) % 3],
5685 vect_shift[(3 - k) % 3],
5686 shift2_mask);
5687 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5688 vect[k] = data_ref;
5691 (*result_chain)[3 - (nelt % 3)] = vect[2];
5693 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5694 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5695 vect[0], shift3_mask);
5696 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5697 (*result_chain)[nelt % 3] = data_ref;
5699 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5700 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5701 vect[1], shift4_mask);
5702 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5703 (*result_chain)[0] = data_ref;
5704 return true;
5706 return false;
5709 /* Function vect_transform_grouped_load.
5711 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5712 to perform their permutation and assign the resulting vectorized statements to
5713 the scalar statements.
5716 void
5717 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5718 gimple_stmt_iterator *gsi)
5720 machine_mode mode;
5721 vec<tree> result_chain = vNULL;
5723 /* DR_CHAIN contains the input data-refs that are a part of the interleaving.
5724 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
5725 permuted vectors that are ready for vector computation. */
5726 result_chain.create (size);
5728 /* If the reassociation width for the vector type is 2 or greater, the target
5729 machine can execute 2 or more vector instructions in parallel. Otherwise try
5730 to build the chain for the load group using vect_shift_permute_load_chain. */
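/* Note the short-circuit evaluation in the condition below:
   vect_shift_permute_load_chain is attempted only when the reassociation
   width for VEC_PERM_EXPR is at most 1 and SIZE is not a power of two; if it
   returns false, vect_permute_load_chain is used as the fallback.  */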
5731 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5732 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5733 || exact_log2 (size) != -1
5734 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5735 gsi, &result_chain))
5736 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5737 vect_record_grouped_load_vectors (stmt, result_chain);
5738 result_chain.release ();
5741 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5742 generated as part of the vectorization of STMT. Assign the statement
5743 for each vector to the associated scalar statement. */
5745 void
5746 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5748 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5749 gimple *next_stmt, *new_stmt;
5750 unsigned int i, gap_count;
5751 tree tmp_data_ref;
5753 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5754 Since we scan the chain starting from its first node, their order
5755 corresponds to the order of data-refs in RESULT_CHAIN. */
5756 next_stmt = first_stmt;
5757 gap_count = 1;
5758 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5760 if (!next_stmt)
5761 break;
5763 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5764 code elimination pass later. No need to check for the first stmt in
5765 the group, since it always exists.
5766 GROUP_GAP is the number of steps in elements from the previous
5767 access (if there is no gap GROUP_GAP is 1). We skip loads that
5768 correspond to the gaps. */
5769 if (next_stmt != first_stmt
5770 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5772 gap_count++;
5773 continue;
5776 while (next_stmt)
5778 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5779 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5780 copies, and we put the new vector statement in the first available
5781 RELATED_STMT. */
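/* That is, the copies of the vector statement form a singly linked list
   through RELATED_STMT; the else-branch below (for statements that do not
   share a data reference) walks to the end of that list and appends
   NEW_STMT there.  */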
5782 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5783 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5784 else
5786 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5788 gimple *prev_stmt =
5789 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5790 gimple *rel_stmt =
5791 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5792 while (rel_stmt)
5794 prev_stmt = rel_stmt;
5795 rel_stmt =
5796 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5799 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5800 new_stmt;
5804 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5805 gap_count = 1;
5806 /* If NEXT_STMT accesses the same DR as the previous statement,
5807 put the same TMP_DATA_REF as its vectorized statement; otherwise
5808 get the next data-ref from RESULT_CHAIN. */
5809 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5810 break;
5815 /* Function vect_can_force_dr_alignment_p.
5817 Returns whether the alignment of a DECL can be forced to an
5818 ALIGNMENT-bit boundary. */
5820 bool
5821 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5823 if (TREE_CODE (decl) != VAR_DECL)
5824 return false;
5826 if (decl_in_symtab_p (decl)
5827 && !symtab_node::get (decl)->can_increase_alignment_p ())
5828 return false;
5830 if (TREE_STATIC (decl))
5831 return (alignment <= MAX_OFILE_ALIGNMENT);
5832 else
5833 return (alignment <= MAX_STACK_ALIGNMENT);
5837 /* Return whether the data reference DR is supported with respect to its
5838 alignment.
5839 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5840 if it is aligned, i.e., check if it is possible to vectorize it with a different
5841 alignment. */
5843 enum dr_alignment_support
5844 vect_supportable_dr_alignment (struct data_reference *dr,
5845 bool check_aligned_accesses)
5847 gimple *stmt = DR_STMT (dr);
5848 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5849 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5850 machine_mode mode = TYPE_MODE (vectype);
5851 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5852 struct loop *vect_loop = NULL;
5853 bool nested_in_vect_loop = false;
5855 if (aligned_access_p (dr) && !check_aligned_accesses)
5856 return dr_aligned;
5858 /* For now assume all conditional loads/stores support unaligned
5859 access without any special code. */
5860 if (is_gimple_call (stmt)
5861 && gimple_call_internal_p (stmt)
5862 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5863 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5864 return dr_unaligned_supported;
5866 if (loop_vinfo)
5868 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5869 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5872 /* Possibly unaligned access. */
5874 /* We can choose between using the implicit realignment scheme (generating
5875 a misaligned_move stmt) and the explicit realignment scheme (generating
5876 aligned loads with a REALIGN_LOAD). There are two variants of the
5877 explicit realignment scheme: optimized and unoptimized.
5878 We can optimize the realignment only if the step between consecutive
5879 vector loads is equal to the vector size. Since the vector memory
5880 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5881 is guaranteed that the misalignment amount remains the same throughout the
5882 execution of the vectorized loop. Therefore, we can create the
5883 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5884 at the loop preheader.
5886 However, in the case of outer-loop vectorization, when vectorizing a
5887 memory access in the inner-loop nested within the LOOP that is now being
5888 vectorized, while it is guaranteed that the misalignment of the
5889 vectorized memory access will remain the same in different outer-loop
5890 iterations, it is *not* guaranteed that it will remain the same throughout
5891 the execution of the inner-loop. This is because the inner-loop advances
5892 with the original scalar step (and not in steps of VS). If the inner-loop
5893 step happens to be a multiple of VS, then the misalignment remains fixed
5894 and we can use the optimized realignment scheme. For example:
5896 for (i=0; i<N; i++)
5897 for (j=0; j<M; j++)
5898 s += a[i+j];
5900 When vectorizing the i-loop in the above example, the step between
5901 consecutive vector loads is 1, and so the misalignment does not remain
5902 fixed across the execution of the inner-loop, and the realignment cannot
5903 be optimized (as illustrated in the following pseudo vectorized loop):
5905 for (i=0; i<N; i+=4)
5906 for (j=0; j<M; j++){
5907 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5908 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5909 // (assuming that we start from an aligned address).
5912 We therefore have to use the unoptimized realignment scheme:
5914 for (i=0; i<N; i+=4)
5915 for (j=k; j<M; j+=4)
5916 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5917 // that the misalignment of the initial address is
5918 // 0).
5920 The loop can then be vectorized as follows:
5922 for (k=0; k<4; k++){
5923 rt = get_realignment_token (&vp[k]);
5924 for (i=0; i<N; i+=4){
5925 v1 = vp[i+k];
5926 for (j=k; j<M; j+=4){
5927 v2 = vp[i+j+VS-1];
5928 va = REALIGN_LOAD <v1,v2,rt>;
5929 vs += va;
5930 v1 = v2;
5933 } */
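/* Summary of the decision logic below: reads may use the explicit realignment
   scheme when the target provides vec_realign_load (and, if it defines
   builtin_mask_for_load, that builtin is available); the optimized variant is
   chosen only when the misalignment is loop-invariant, i.e. we are inside a
   loop and not in the nested case with a step different from the vector size.
   Otherwise both reads and writes fall back to the target's support for
   misaligned vector accesses.  */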
5935 if (DR_IS_READ (dr))
5937 bool is_packed = false;
5938 tree type = (TREE_TYPE (DR_REF (dr)));
5940 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5941 && (!targetm.vectorize.builtin_mask_for_load
5942 || targetm.vectorize.builtin_mask_for_load ()))
5944 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5945 if ((nested_in_vect_loop
5946 && (TREE_INT_CST_LOW (DR_STEP (dr))
5947 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5948 || !loop_vinfo)
5949 return dr_explicit_realign;
5950 else
5951 return dr_explicit_realign_optimized;
5953 if (!known_alignment_for_access_p (dr))
5954 is_packed = not_size_aligned (DR_REF (dr));
5956 if ((TYPE_USER_ALIGN (type) && !is_packed)
5957 || targetm.vectorize.
5958 support_vector_misalignment (mode, type,
5959 DR_MISALIGNMENT (dr), is_packed))
5960 /* Can't software pipeline the loads, but can at least do them. */
5961 return dr_unaligned_supported;
5963 else
5965 bool is_packed = false;
5966 tree type = (TREE_TYPE (DR_REF (dr)));
5968 if (!known_alignment_for_access_p (dr))
5969 is_packed = not_size_aligned (DR_REF (dr));
5971 if ((TYPE_USER_ALIGN (type) && !is_packed)
5972 || targetm.vectorize.
5973 support_vector_misalignment (mode, type,
5974 DR_MISALIGNMENT (dr), is_packed))
5975 return dr_unaligned_supported;
5978 /* Unsupported. */
5979 return dr_unaligned_unsupported;