gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2016 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
54 /* Return true if load- or store-lanes optab OPTAB is implemented for
55 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
57 static bool
58 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
59 tree vectype, unsigned HOST_WIDE_INT count)
61 machine_mode mode, array_mode;
62 bool limit_p;
64 mode = TYPE_MODE (vectype);
65 limit_p = !targetm.array_mode_supported_p (mode, count);
66 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
67 MODE_INT, limit_p);
69 if (array_mode == BLKmode)
71 if (dump_enabled_p ())
72 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
73 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
74 GET_MODE_NAME (mode), count);
75 return false;
78 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
80 if (dump_enabled_p ())
81 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
82 "cannot use %s<%s><%s>\n", name,
83 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
84 return false;
87 if (dump_enabled_p ())
88 dump_printf_loc (MSG_NOTE, vect_location,
89 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
90 GET_MODE_NAME (mode));
92 return true;
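/* Editorial illustration (a sketch, not part of the original file): for a
   group of three interleaved loads such as

     for (i = 0; i < n; i++)
       x[i] = a[3 * i] + a[3 * i + 1] + a[3 * i + 2];

   the vectorizer asks vect_lanes_optab_supported_p ("vec_load_lanes",
   vec_load_lanes_optab, vectype, 3) whether the target can fill three
   vectors of VECTYPE from one contiguous array mode with a single
   load-lanes instruction; if not, it falls back to load/permute
   sequences.  */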
96 /* Return the smallest scalar part of STMT.
97 This is used to determine the vectype of the stmt. We generally set the
98 vectype according to the type of the result (lhs). For stmts whose
99 result-type is different than the type of the arguments (e.g., demotion,
100 promotion), vectype will be reset appropriately (later). Note that we have
101 to visit the smallest datatype in this function, because that determines the
102 VF. If the smallest datatype in the loop is present only as the rhs of a
103 promotion operation - we'd miss it.
104 Such a case, where a variable of this datatype does not appear in the lhs
105 anywhere in the loop, can only occur if it's an invariant: e.g.:
106 'int_x = (int) short_inv', which we'd expect to have been optimized away by
107 invariant motion. However, we cannot rely on invariant motion to always
108 take invariants out of the loop, and so in the case of promotion we also
109 have to check the rhs.
110 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
111 types. */
113 tree
114 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
115 HOST_WIDE_INT *rhs_size_unit)
117 tree scalar_type = gimple_expr_type (stmt);
118 HOST_WIDE_INT lhs, rhs;
120 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
122 if (is_gimple_assign (stmt)
123 && (gimple_assign_cast_p (stmt)
124 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
125 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
126 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
128 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
130 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
131 if (rhs < lhs)
132 scalar_type = rhs_type;
135 *lhs_size_unit = lhs;
136 *rhs_size_unit = rhs;
137 return scalar_type;
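/* Editorial example: for a widening statement such as

     short *b;
     int x = (int) b[i];

   the result type is int but the smallest scalar type involved is short;
   returning the short type here lets the caller size the vectorization
   factor by the narrow element, which is what actually determines VF.  */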
141 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
142 tested at run-time. Return TRUE if DDR was successfully inserted.
143 Return false if versioning is not supported. */
145 static bool
146 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
148 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
150 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
151 return false;
153 if (dump_enabled_p ())
155 dump_printf_loc (MSG_NOTE, vect_location,
156 "mark for run-time aliasing test between ");
157 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
158 dump_printf (MSG_NOTE, " and ");
159 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
160 dump_printf (MSG_NOTE, "\n");
163 if (optimize_loop_nest_for_size_p (loop))
165 if (dump_enabled_p ())
166 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
167 "versioning not supported when optimizing"
168 " for size.\n");
169 return false;
172 /* FORNOW: We don't support versioning with outer-loop vectorization. */
173 if (loop->inner)
175 if (dump_enabled_p ())
176 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
177 "versioning not yet supported for outer-loops.\n");
178 return false;
181 /* FORNOW: We don't support creating runtime alias tests for non-constant
182 step. */
183 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
184 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
186 if (dump_enabled_p ())
187 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
188 "versioning not yet supported for non-constant "
189 "step\n");
190 return false;
193 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
194 return true;
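/* Sketch of the run-time test that versioning for alias later emits
   (editorial illustration, not code from this file): for the two
   data-refs recorded here the generated check is roughly

     if (addr_a + segment_length_a <= addr_b
	 || addr_b + segment_length_b <= addr_a)
       ... vectorized loop ...
     else
       ... scalar fallback loop ...

   where each segment length covers all bytes the vector loop accesses
   through that reference.  */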
198 /* Function vect_analyze_data_ref_dependence.
200 Return TRUE if there (might) exist a dependence between a memory-reference
201 DRA and a memory-reference DRB. When versioning for alias may check a
202 dependence at run-time, return FALSE. Adjust *MAX_VF according to
203 the data dependence. */
205 static bool
206 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
207 loop_vec_info loop_vinfo, int *max_vf)
209 unsigned int i;
210 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
211 struct data_reference *dra = DDR_A (ddr);
212 struct data_reference *drb = DDR_B (ddr);
213 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
214 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
215 lambda_vector dist_v;
216 unsigned int loop_depth;
218 /* In loop analysis all data references should be vectorizable. */
219 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
220 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
221 gcc_unreachable ();
223 /* Independent data accesses. */
224 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
225 return false;
227 if (dra == drb
228 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
229 return false;
231 /* We do not have to consider dependences between accesses that belong
232 to the same group. */
233 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
234 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
235 return false;
237 /* Even if we have an anti-dependence then, as the vectorized loop covers at
238 least two scalar iterations, there is always also a true dependence.
239 As the vectorizer does not re-order loads and stores we can ignore
240 the anti-dependence if TBAA can disambiguate both DRs similar to the
241 case with known negative distance anti-dependences (positive
242 distance anti-dependences would violate TBAA constraints). */
243 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
244 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
245 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
246 get_alias_set (DR_REF (drb))))
247 return false;
249 /* Unknown data dependence. */
250 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
252 /* If user asserted safelen consecutive iterations can be
253 executed concurrently, assume independence. */
254 if (loop->safelen >= 2)
256 if (loop->safelen < *max_vf)
257 *max_vf = loop->safelen;
258 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
259 return false;
262 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
263 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
265 if (dump_enabled_p ())
267 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
268 "versioning for alias not supported for: "
269 "can't determine dependence between ");
270 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
271 DR_REF (dra));
272 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
273 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
274 DR_REF (drb));
275 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
277 return true;
280 if (dump_enabled_p ())
282 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
283 "versioning for alias required: "
284 "can't determine dependence between ");
285 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
286 DR_REF (dra));
287 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
288 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
289 DR_REF (drb));
290 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
293 /* Add to list of ddrs that need to be tested at run-time. */
294 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
297 /* Known data dependence. */
298 if (DDR_NUM_DIST_VECTS (ddr) == 0)
300 /* If user asserted safelen consecutive iterations can be
301 executed concurrently, assume independence. */
302 if (loop->safelen >= 2)
304 if (loop->safelen < *max_vf)
305 *max_vf = loop->safelen;
306 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
307 return false;
310 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
311 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
313 if (dump_enabled_p ())
315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
316 "versioning for alias not supported for: "
317 "bad dist vector for ");
318 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
319 DR_REF (dra));
320 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
321 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
322 DR_REF (drb));
323 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
325 return true;
328 if (dump_enabled_p ())
330 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
331 "versioning for alias required: "
332 "bad dist vector for ");
333 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
334 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
335 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
336 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
338 /* Add to list of ddrs that need to be tested at run-time. */
339 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
342 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
343 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
345 int dist = dist_v[loop_depth];
347 if (dump_enabled_p ())
348 dump_printf_loc (MSG_NOTE, vect_location,
349 "dependence distance = %d.\n", dist);
351 if (dist == 0)
353 if (dump_enabled_p ())
355 dump_printf_loc (MSG_NOTE, vect_location,
356 "dependence distance == 0 between ");
357 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
358 dump_printf (MSG_NOTE, " and ");
359 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
360 dump_printf (MSG_NOTE, "\n");
363 /* When we perform grouped accesses and perform implicit CSE
364 by detecting equal accesses and doing disambiguation with
365 runtime alias tests like for
366 .. = a[i];
367 .. = a[i+1];
368 a[i] = ..;
369 a[i+1] = ..;
370 *p = ..;
371 .. = a[i];
372 .. = a[i+1];
373 where we will end up loading { a[i], a[i+1] } once, make
374 sure that inserting group loads before the first load and
375 stores after the last store will do the right thing.
376 Similar for groups like
377 a[i] = ...;
378 ... = a[i];
379 a[i+1] = ...;
380 where loads from the group interleave with the store. */
381 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
382 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
384 gimple *earlier_stmt;
385 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
386 if (DR_IS_WRITE
387 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
389 if (dump_enabled_p ())
390 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
391 "READ_WRITE dependence in interleaving."
392 "\n");
393 return true;
397 continue;
400 if (dist > 0 && DDR_REVERSED_P (ddr))
402 /* If DDR_REVERSED_P the order of the data-refs in DDR was
403 reversed (to make distance vector positive), and the actual
404 distance is negative. */
405 if (dump_enabled_p ())
406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
407 "dependence distance negative.\n");
408 /* Record a negative dependence distance to later limit the
409 amount of stmt copying / unrolling we can perform.
410 Only need to handle read-after-write dependence. */
411 if (DR_IS_READ (drb)
412 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
413 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
414 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
415 continue;
418 if (abs (dist) >= 2
419 && abs (dist) < *max_vf)
421 /* The dependence distance requires reduction of the maximal
422 vectorization factor. */
423 *max_vf = abs (dist);
424 if (dump_enabled_p ())
425 dump_printf_loc (MSG_NOTE, vect_location,
426 "adjusting maximal vectorization factor to %i\n",
427 *max_vf);
430 if (abs (dist) >= *max_vf)
432 /* Dependence distance does not create dependence, as far as
433 vectorization is concerned, in this case. */
434 if (dump_enabled_p ())
435 dump_printf_loc (MSG_NOTE, vect_location,
436 "dependence distance >= VF.\n");
437 continue;
440 if (dump_enabled_p ())
442 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
443 "not vectorized, possible dependence "
444 "between data-refs ");
445 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
446 dump_printf (MSG_NOTE, " and ");
447 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
448 dump_printf (MSG_NOTE, "\n");
451 return true;
454 return false;
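/* Worked example (editorial): for

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1;

   the dependence distance is 3, so *max_vf is clamped to 3 and the loop
   can still be vectorized with a smaller factor, whereas a distance of 1
   (a[i + 1] = a[i] + 1) reaches the final "possible dependence"
   diagnostic above and the loop is rejected.  */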
457 /* Function vect_analyze_data_ref_dependences.
459 Examine all the data references in the loop, and make sure there do not
460 exist any data dependences between them. Set *MAX_VF according to
461 the maximum vectorization factor the data dependences allow. */
463 bool
464 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
466 unsigned int i;
467 struct data_dependence_relation *ddr;
469 if (dump_enabled_p ())
470 dump_printf_loc (MSG_NOTE, vect_location,
471 "=== vect_analyze_data_ref_dependences ===\n");
473 LOOP_VINFO_DDRS (loop_vinfo)
474 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
475 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
476 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
477 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
478 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
479 &LOOP_VINFO_DDRS (loop_vinfo),
480 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
481 return false;
483 /* For epilogues we either have no aliases or alias versioning
484 was applied to original loop. Therefore we may just get max_vf
485 using VF of original loop. */
486 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
487 *max_vf = LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo);
488 else
489 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
490 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
491 return false;
493 return true;
497 /* Function vect_slp_analyze_data_ref_dependence.
499 Return TRUE if there (might) exist a dependence between a memory-reference
500 DRA and a memory-reference DRB that could prevent SLP vectorization.
501 Unlike the loop variant above, this function has no run-time alias
502 versioning fallback and no *MAX_VF to adjust. */
504 static bool
505 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
507 struct data_reference *dra = DDR_A (ddr);
508 struct data_reference *drb = DDR_B (ddr);
510 /* We need to check dependences of statements marked as unvectorizable
511 as well, they still can prohibit vectorization. */
513 /* Independent data accesses. */
514 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
515 return false;
517 if (dra == drb)
518 return false;
520 /* Read-read is OK. */
521 if (DR_IS_READ (dra) && DR_IS_READ (drb))
522 return false;
524 /* If dra and drb are part of the same interleaving chain consider
525 them independent. */
526 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
527 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
528 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
529 return false;
531 /* Unknown data dependence. */
532 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
534 if (dump_enabled_p ())
536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
537 "can't determine dependence between ");
538 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
539 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
540 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
541 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
544 else if (dump_enabled_p ())
546 dump_printf_loc (MSG_NOTE, vect_location,
547 "determined dependence between ");
548 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
549 dump_printf (MSG_NOTE, " and ");
550 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
551 dump_printf (MSG_NOTE, "\n");
554 return true;
558 /* Analyze dependences involved in the transform of SLP NODE. STORES
559 contain the vector of scalar stores of this instance if we are
560 disambiguating the loads. */
562 static bool
563 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
564 vec<gimple *> stores, gimple *last_store)
566 /* This walks over all stmts involved in the SLP load/store done
567 in NODE verifying we can sink them up to the last stmt in the
568 group. */
569 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
570 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
572 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
573 if (access == last_access)
574 continue;
575 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
576 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
577 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
579 gimple *stmt = gsi_stmt (gsi);
580 if (! gimple_vuse (stmt)
581 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
582 continue;
584 /* If we couldn't record a (single) data reference for this
585 stmt we have to give up. */
586 /* ??? Here and below if dependence analysis fails we can resort
587 to the alias oracle which can handle more kinds of stmts. */
588 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
589 if (!dr_b)
590 return false;
592 bool dependent = false;
593 /* If we run into a store of this same instance (we've just
594 marked those) then delay dependence checking until we run
595 into the last store because this is where it will have
596 been sunk to (and we verify if we can do that as well). */
597 if (gimple_visited_p (stmt))
599 if (stmt != last_store)
600 continue;
601 unsigned i;
602 gimple *store;
603 FOR_EACH_VEC_ELT (stores, i, store)
605 data_reference *store_dr
606 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
607 ddr_p ddr = initialize_data_dependence_relation
608 (dr_a, store_dr, vNULL);
609 dependent = vect_slp_analyze_data_ref_dependence (ddr);
610 free_dependence_relation (ddr);
611 if (dependent)
612 break;
615 else
617 ddr_p ddr = initialize_data_dependence_relation (dr_a,
618 dr_b, vNULL);
619 dependent = vect_slp_analyze_data_ref_dependence (ddr);
620 free_dependence_relation (ddr);
622 if (dependent)
623 return false;
626 return true;
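/* Editorial sketch of the sinking this function verifies: for an SLP
   store group with an intervening store

     a[0] = x;
     *p = 1;
     a[1] = y;

   the vector store of { x, y } is emitted at the position of a[1] = y,
   so the walk above must show that a[0] = x can be moved past *p = 1,
   i.e. that the two references cannot alias.  */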
630 /* Function vect_slp_analyze_instance_dependence.
632 Examine all the data references in the SLP instance, and make sure there
633 do not exist any data dependences between them that would prevent the
634 instance from being vectorized. */
636 bool
637 vect_slp_analyze_instance_dependence (slp_instance instance)
639 if (dump_enabled_p ())
640 dump_printf_loc (MSG_NOTE, vect_location,
641 "=== vect_slp_analyze_instance_dependence ===\n");
643 /* The stores of this instance are at the root of the SLP tree. */
644 slp_tree store = SLP_INSTANCE_TREE (instance);
645 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
646 store = NULL;
648 /* Verify we can sink stores to the vectorized stmt insert location. */
649 gimple *last_store = NULL;
650 if (store)
652 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
653 return false;
655 /* Mark stores in this instance and remember the last one. */
656 last_store = vect_find_last_scalar_stmt_in_slp (store);
657 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
658 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
661 bool res = true;
663 /* Verify we can sink loads to the vectorized stmt insert location,
664 special-casing stores of this instance. */
665 slp_tree load;
666 unsigned int i;
667 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
668 if (! vect_slp_analyze_node_dependences (instance, load,
669 store
670 ? SLP_TREE_SCALAR_STMTS (store)
671 : vNULL, last_store))
673 res = false;
674 break;
677 /* Unset the visited flag. */
678 if (store)
679 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
680 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
682 return res;
685 /* Function vect_compute_data_ref_alignment
687 Compute the misalignment of the data reference DR.
689 Output:
690 1. If during the misalignment computation it is found that the data reference
691 cannot be vectorized then false is returned.
692 2. DR_MISALIGNMENT (DR) is defined.
694 FOR NOW: No analysis is actually performed. Misalignment is calculated
695 only for trivial cases. TODO. */
697 bool
698 vect_compute_data_ref_alignment (struct data_reference *dr)
700 gimple *stmt = DR_STMT (dr);
701 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
702 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
703 struct loop *loop = NULL;
704 tree ref = DR_REF (dr);
705 tree vectype;
706 tree base, base_addr;
707 tree misalign = NULL_TREE;
708 tree aligned_to;
709 tree step;
710 unsigned HOST_WIDE_INT alignment;
712 if (dump_enabled_p ())
713 dump_printf_loc (MSG_NOTE, vect_location,
714 "vect_compute_data_ref_alignment:\n");
716 if (loop_vinfo)
717 loop = LOOP_VINFO_LOOP (loop_vinfo);
719 /* Initialize misalignment to unknown. */
720 SET_DR_MISALIGNMENT (dr, -1);
722 if (tree_fits_shwi_p (DR_STEP (dr)))
723 misalign = DR_INIT (dr);
724 aligned_to = DR_ALIGNED_TO (dr);
725 base_addr = DR_BASE_ADDRESS (dr);
726 vectype = STMT_VINFO_VECTYPE (stmt_info);
728 /* In case the dataref is in an inner-loop of the loop that is being
729 vectorized (LOOP), we use the base and misalignment information
730 relative to the outer-loop (LOOP). This is ok only if the misalignment
731 stays the same throughout the execution of the inner-loop, which is why
732 we have to check that the stride of the dataref in the inner-loop evenly
733 divides by the vector size. */
734 if (loop && nested_in_vect_loop_p (loop, stmt))
736 tree step = DR_STEP (dr);
738 if (tree_fits_shwi_p (step)
739 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
741 if (dump_enabled_p ())
742 dump_printf_loc (MSG_NOTE, vect_location,
743 "inner step divides the vector-size.\n");
744 misalign = STMT_VINFO_DR_INIT (stmt_info);
745 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
746 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
748 else
750 if (dump_enabled_p ())
751 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
752 "inner step doesn't divide the vector-size.\n");
753 misalign = NULL_TREE;
757 /* Similarly we can only use base and misalignment information relative to
758 an innermost loop if the misalignment stays the same throughout the
759 execution of the loop. As above, this is the case if the stride of
760 the dataref evenly divides by the vector size. */
761 else
763 tree step = DR_STEP (dr);
764 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
766 if (tree_fits_shwi_p (step)
767 && ((tree_to_shwi (step) * vf)
768 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
770 if (dump_enabled_p ())
771 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
772 "step doesn't divide the vector-size.\n");
773 misalign = NULL_TREE;
777 /* To look at alignment of the base we have to preserve an inner MEM_REF
778 as that carries alignment information of the actual access. */
779 base = ref;
780 while (handled_component_p (base))
781 base = TREE_OPERAND (base, 0);
782 unsigned int base_alignment;
783 unsigned HOST_WIDE_INT base_bitpos;
784 get_object_alignment_1 (base, &base_alignment, &base_bitpos);
785 /* As data-ref analysis strips the MEM_REF down to its base operand
786 to form DR_BASE_ADDRESS and adds the offset to DR_INIT we have to
787 adjust things to make base_alignment valid as the alignment of
788 DR_BASE_ADDRESS. */
789 if (TREE_CODE (base) == MEM_REF)
791 base_bitpos -= mem_ref_offset (base).to_short_addr () * BITS_PER_UNIT;
792 base_bitpos &= (base_alignment - 1);
794 if (base_bitpos != 0)
795 base_alignment = base_bitpos & -base_bitpos;
796 /* Also look at the alignment of the base address DR analysis
797 computed. */
798 unsigned int base_addr_alignment = get_pointer_alignment (base_addr);
799 if (base_addr_alignment > base_alignment)
800 base_alignment = base_addr_alignment;
802 if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
803 DR_VECT_AUX (dr)->base_element_aligned = true;
805 alignment = TYPE_ALIGN_UNIT (vectype);
807 if ((compare_tree_int (aligned_to, alignment) < 0)
808 || !misalign)
810 if (dump_enabled_p ())
812 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
813 "Unknown alignment for access: ");
814 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
815 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
817 return true;
820 if (base_alignment < TYPE_ALIGN (vectype))
822 base = base_addr;
823 if (TREE_CODE (base) == ADDR_EXPR)
824 base = TREE_OPERAND (base, 0);
825 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
827 if (dump_enabled_p ())
829 dump_printf_loc (MSG_NOTE, vect_location,
830 "can't force alignment of ref: ");
831 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
832 dump_printf (MSG_NOTE, "\n");
834 return true;
837 if (DECL_USER_ALIGN (base))
839 if (dump_enabled_p ())
841 dump_printf_loc (MSG_NOTE, vect_location,
842 "not forcing alignment of user-aligned "
843 "variable: ");
844 dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
845 dump_printf (MSG_NOTE, "\n");
847 return true;
850 /* Force the alignment of the decl.
851 NOTE: This is the only change to the code we make during
852 the analysis phase, before deciding to vectorize the loop. */
853 if (dump_enabled_p ())
855 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
856 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
857 dump_printf (MSG_NOTE, "\n");
860 DR_VECT_AUX (dr)->base_decl = base;
861 DR_VECT_AUX (dr)->base_misaligned = true;
862 DR_VECT_AUX (dr)->base_element_aligned = true;
865 if (loop && nested_in_vect_loop_p (loop, stmt))
866 step = STMT_VINFO_DR_STEP (stmt_info);
867 else
868 step = DR_STEP (dr);
869 /* If this is a backward running DR then first access in the larger
870 vectype actually is N-1 elements before the address in the DR.
871 Adjust misalign accordingly. */
872 if (tree_int_cst_sgn (step) < 0)
874 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
875 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
876 otherwise we wouldn't be here. */
877 offset = fold_build2 (MULT_EXPR, ssizetype, offset, step);
878 /* PLUS because STEP was negative. */
879 misalign = size_binop (PLUS_EXPR, misalign, offset);
882 SET_DR_MISALIGNMENT (dr,
883 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
885 if (dump_enabled_p ())
887 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
888 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
889 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
890 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
893 return true;
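/* Numeric example (editorial): with a V4SI vectype (16-byte vector
   alignment) and a data-ref whose DR_BASE_ADDRESS is known to be 16-byte
   aligned with DR_INIT == 12, the code above records
   DR_MISALIGNMENT (dr) == 12.  For a backward-running DR with step -4
   the first vector element actually sits 3 * 4 bytes before the address
   in the DR, so 12 - 12 == 0 is recorded instead.  */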
897 /* Function vect_update_misalignment_for_peel
899 DR - the data reference whose misalignment is to be adjusted.
900 DR_PEEL - the data reference whose misalignment is being made
901 zero in the vector loop by the peel.
902 NPEEL - the number of iterations in the peel loop if the misalignment
903 of DR_PEEL is known at compile time. */
905 static void
906 vect_update_misalignment_for_peel (struct data_reference *dr,
907 struct data_reference *dr_peel, int npeel)
909 unsigned int i;
910 vec<dr_p> same_align_drs;
911 struct data_reference *current_dr;
912 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
913 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
914 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
915 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
917 /* For interleaved data accesses the step in the loop must be multiplied by
918 the size of the interleaving group. */
919 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
920 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
921 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
922 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
924 /* It can be assumed that the data refs with the same alignment as dr_peel
925 are aligned in the vector loop. */
926 same_align_drs
927 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
928 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
930 if (current_dr != dr)
931 continue;
932 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
933 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
934 SET_DR_MISALIGNMENT (dr, 0);
935 return;
938 if (known_alignment_for_access_p (dr)
939 && known_alignment_for_access_p (dr_peel))
941 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
942 int misal = DR_MISALIGNMENT (dr);
943 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
944 misal += negative ? -npeel * dr_size : npeel * dr_size;
945 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
946 SET_DR_MISALIGNMENT (dr, misal);
947 return;
950 if (dump_enabled_p ())
951 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
952 SET_DR_MISALIGNMENT (dr, -1);
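/* Editorial example: if DR has a 4-byte element, a known misalignment of
   12 bytes and a 16-byte vector alignment, peeling npeel == 3 iterations
   for some other reference advances DR by 3 * 4 == 12 bytes, giving the
   new misalignment (12 + 12) & 15 == 8.  A DR listed among the
   same-alignment refs of DR_PEEL is instead simply set to 0 above.  */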
956 /* Function verify_data_ref_alignment
958 Return TRUE if DR can be handled with respect to alignment. */
960 static bool
961 verify_data_ref_alignment (data_reference_p dr)
963 enum dr_alignment_support supportable_dr_alignment
964 = vect_supportable_dr_alignment (dr, false);
965 if (!supportable_dr_alignment)
967 if (dump_enabled_p ())
969 if (DR_IS_READ (dr))
970 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
971 "not vectorized: unsupported unaligned load.");
972 else
973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
974 "not vectorized: unsupported unaligned "
975 "store.");
977 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
978 DR_REF (dr));
979 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
981 return false;
984 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
985 dump_printf_loc (MSG_NOTE, vect_location,
986 "Vectorizing an unaligned access.\n");
988 return true;
991 /* Function vect_verify_datarefs_alignment
993 Return TRUE if all data references in the loop can be
994 handled with respect to alignment. */
996 bool
997 vect_verify_datarefs_alignment (loop_vec_info vinfo)
999 vec<data_reference_p> datarefs = vinfo->datarefs;
1000 struct data_reference *dr;
1001 unsigned int i;
1003 FOR_EACH_VEC_ELT (datarefs, i, dr)
1005 gimple *stmt = DR_STMT (dr);
1006 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1008 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1009 continue;
1011 /* For interleaving, only the alignment of the first access matters. */
1012 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1013 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1014 continue;
1016 /* Strided accesses perform only component accesses, alignment is
1017 irrelevant for them. */
1018 if (STMT_VINFO_STRIDED_P (stmt_info)
1019 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1020 continue;
1022 if (! verify_data_ref_alignment (dr))
1023 return false;
1026 return true;
1029 /* Given a memory reference EXP return whether its alignment is less
1030 than its size. */
1032 static bool
1033 not_size_aligned (tree exp)
1035 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1036 return true;
1038 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1039 > get_object_alignment (exp));
1042 /* Function vector_alignment_reachable_p
1044 Return true if vector alignment for DR is reachable by peeling
1045 a few loop iterations. Return false otherwise. */
1047 static bool
1048 vector_alignment_reachable_p (struct data_reference *dr)
1050 gimple *stmt = DR_STMT (dr);
1051 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1052 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1054 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1056 /* For interleaved access we peel only if number of iterations in
1057 the prolog loop (VF - misalignment) is a multiple of the
1058 number of the interleaved accesses. */
1059 int elem_size, mis_in_elements;
1060 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1062 /* FORNOW: handle only known alignment. */
1063 if (!known_alignment_for_access_p (dr))
1064 return false;
1066 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1067 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1069 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1070 return false;
1073 /* If misalignment is known at the compile time then allow peeling
1074 only if natural alignment is reachable through peeling. */
1075 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1077 HOST_WIDE_INT elmsize =
1078 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1079 if (dump_enabled_p ())
1081 dump_printf_loc (MSG_NOTE, vect_location,
1082 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1083 dump_printf (MSG_NOTE,
1084 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1086 if (DR_MISALIGNMENT (dr) % elmsize)
1088 if (dump_enabled_p ())
1089 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1090 "data size does not divide the misalignment.\n");
1091 return false;
1095 if (!known_alignment_for_access_p (dr))
1097 tree type = TREE_TYPE (DR_REF (dr));
1098 bool is_packed = not_size_aligned (DR_REF (dr));
1099 if (dump_enabled_p ())
1100 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1101 "Unknown misalignment, is_packed = %d\n",is_packed);
1102 if ((TYPE_USER_ALIGN (type) && !is_packed)
1103 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1104 return true;
1105 else
1106 return false;
1109 return true;
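/* Editorial example: a V4SI access with a known misalignment of 8 bytes
   becomes aligned after (16 - 8) / 4 == 2 peeled scalar iterations, so
   alignment is reachable; a misalignment of 2 bytes cannot be removed by
   peeling whole 4-byte elements and is rejected above with "data size
   does not divide the misalignment".  */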
1113 /* Calculate the cost of the memory access represented by DR. */
1115 static void
1116 vect_get_data_access_cost (struct data_reference *dr,
1117 unsigned int *inside_cost,
1118 unsigned int *outside_cost,
1119 stmt_vector_for_cost *body_cost_vec)
1121 gimple *stmt = DR_STMT (dr);
1122 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1123 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1124 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1125 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1126 int ncopies = vf / nunits;
1128 if (DR_IS_READ (dr))
1129 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1130 NULL, body_cost_vec, false);
1131 else
1132 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1134 if (dump_enabled_p ())
1135 dump_printf_loc (MSG_NOTE, vect_location,
1136 "vect_get_data_access_cost: inside_cost = %d, "
1137 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1141 typedef struct _vect_peel_info
1143 int npeel;
1144 struct data_reference *dr;
1145 unsigned int count;
1146 } *vect_peel_info;
1148 typedef struct _vect_peel_extended_info
1150 struct _vect_peel_info peel_info;
1151 unsigned int inside_cost;
1152 unsigned int outside_cost;
1153 stmt_vector_for_cost body_cost_vec;
1154 } *vect_peel_extended_info;
1157 /* Peeling hashtable helpers. */
1159 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1161 static inline hashval_t hash (const _vect_peel_info *);
1162 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1165 inline hashval_t
1166 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1168 return (hashval_t) peel_info->npeel;
1171 inline bool
1172 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1174 return (a->npeel == b->npeel);
1178 /* Insert DR into peeling hash table with NPEEL as key. */
1180 static void
1181 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1182 loop_vec_info loop_vinfo, struct data_reference *dr,
1183 int npeel)
1185 struct _vect_peel_info elem, *slot;
1186 _vect_peel_info **new_slot;
1187 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1189 elem.npeel = npeel;
1190 slot = peeling_htab->find (&elem);
1191 if (slot)
1192 slot->count++;
1193 else
1195 slot = XNEW (struct _vect_peel_info);
1196 slot->npeel = npeel;
1197 slot->dr = dr;
1198 slot->count = 1;
1199 new_slot = peeling_htab->find_slot (slot, INSERT);
1200 *new_slot = slot;
1203 if (!supportable_dr_alignment
1204 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1205 slot->count += VECT_MAX_COST;
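/* Editorial example: with two V4SI refs that need 3 peeled iterations and
   one that needs 2, the table holds the entries { npeel 3, count 2 } and
   { npeel 2, count 1 }; the most-frequent traversal below would then pick
   npeel == 3.  Inflating the count by VECT_MAX_COST above makes aligning
   an otherwise unsupported unaligned ref all but mandatory under the
   unlimited cost model.  */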
1209 /* Traverse peeling hash table to find peeling option that aligns maximum
1210 number of data accesses. */
1213 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1214 _vect_peel_extended_info *max)
1216 vect_peel_info elem = *slot;
1218 if (elem->count > max->peel_info.count
1219 || (elem->count == max->peel_info.count
1220 && max->peel_info.npeel > elem->npeel))
1222 max->peel_info.npeel = elem->npeel;
1223 max->peel_info.count = elem->count;
1224 max->peel_info.dr = elem->dr;
1227 return 1;
1231 /* Traverse peeling hash table and calculate cost for each peeling option.
1232 Find the one with the lowest cost. */
1235 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1236 _vect_peel_extended_info *min)
1238 vect_peel_info elem = *slot;
1239 int save_misalignment, dummy;
1240 unsigned int inside_cost = 0, outside_cost = 0, i;
1241 gimple *stmt = DR_STMT (elem->dr);
1242 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1243 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1244 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1245 struct data_reference *dr;
1246 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1248 prologue_cost_vec.create (2);
1249 body_cost_vec.create (2);
1250 epilogue_cost_vec.create (2);
1252 FOR_EACH_VEC_ELT (datarefs, i, dr)
1254 stmt = DR_STMT (dr);
1255 stmt_info = vinfo_for_stmt (stmt);
1256 /* For interleaving, only the alignment of the first access
1257 matters. */
1258 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1259 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1260 continue;
1262 /* Strided accesses perform only component accesses, alignment is
1263 irrelevant for them. */
1264 if (STMT_VINFO_STRIDED_P (stmt_info)
1265 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1266 continue;
1268 save_misalignment = DR_MISALIGNMENT (dr);
1269 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1270 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1271 &body_cost_vec);
1272 SET_DR_MISALIGNMENT (dr, save_misalignment);
1275 outside_cost += vect_get_known_peeling_cost
1276 (loop_vinfo, elem->npeel, &dummy,
1277 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1278 &prologue_cost_vec, &epilogue_cost_vec);
1280 /* Prologue and epilogue costs are added to the target model later.
1281 These costs depend only on the scalar iteration cost, the
1282 number of peeling iterations finally chosen, and the number of
1283 misaligned statements. So discard the information found here. */
1284 prologue_cost_vec.release ();
1285 epilogue_cost_vec.release ();
1287 if (inside_cost < min->inside_cost
1288 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1290 min->inside_cost = inside_cost;
1291 min->outside_cost = outside_cost;
1292 min->body_cost_vec.release ();
1293 min->body_cost_vec = body_cost_vec;
1294 min->peel_info.dr = elem->dr;
1295 min->peel_info.npeel = elem->npeel;
1297 else
1298 body_cost_vec.release ();
1300 return 1;
1304 /* Choose best peeling option by traversing peeling hash table and either
1305 choosing an option with the lowest cost (if cost model is enabled) or the
1306 option that aligns as many accesses as possible. */
1308 static struct data_reference *
1309 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1310 loop_vec_info loop_vinfo,
1311 unsigned int *npeel,
1312 stmt_vector_for_cost *body_cost_vec)
1314 struct _vect_peel_extended_info res;
1316 res.peel_info.dr = NULL;
1317 res.body_cost_vec = stmt_vector_for_cost ();
1319 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1321 res.inside_cost = INT_MAX;
1322 res.outside_cost = INT_MAX;
1323 peeling_htab->traverse <_vect_peel_extended_info *,
1324 vect_peeling_hash_get_lowest_cost> (&res);
1326 else
1328 res.peel_info.count = 0;
1329 peeling_htab->traverse <_vect_peel_extended_info *,
1330 vect_peeling_hash_get_most_frequent> (&res);
1333 *npeel = res.peel_info.npeel;
1334 *body_cost_vec = res.body_cost_vec;
1335 return res.peel_info.dr;
1339 /* Function vect_enhance_data_refs_alignment
1341 This pass will use loop versioning and loop peeling in order to enhance
1342 the alignment of data references in the loop.
1344 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1345 original loop is to be vectorized. Any other loops that are created by
1346 the transformations performed in this pass - are not supposed to be
1347 vectorized. This restriction will be relaxed.
1349 This pass will require a cost model to guide it whether to apply peeling
1350 or versioning or a combination of the two. For example, the scheme that
1351 Intel uses when given a loop with several memory accesses is as follows:
1352 choose one memory access ('p') whose alignment you want to force by doing
1353 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1354 other accesses are not necessarily aligned, or (2) use loop versioning to
1355 generate one loop in which all accesses are aligned, and another loop in
1356 which only 'p' is necessarily aligned.
1358 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1359 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1360 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1362 Devising a cost model is the most critical aspect of this work. It will
1363 guide us on which access to peel for, whether to use loop versioning, how
1364 many versions to create, etc. The cost model will probably consist of
1365 generic considerations as well as target specific considerations (on
1366 powerpc for example, misaligned stores are more painful than misaligned
1367 loads).
1369 Here are the general steps involved in alignment enhancements:
1371 -- original loop, before alignment analysis:
1372 for (i=0; i<N; i++){
1373 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1374 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1377 -- After vect_compute_data_refs_alignment:
1378 for (i=0; i<N; i++){
1379 x = q[i]; # DR_MISALIGNMENT(q) = 3
1380 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1383 -- Possibility 1: we do loop versioning:
1384 if (p is aligned) {
1385 for (i=0; i<N; i++){ # loop 1A
1386 x = q[i]; # DR_MISALIGNMENT(q) = 3
1387 p[i] = y; # DR_MISALIGNMENT(p) = 0
1390 else {
1391 for (i=0; i<N; i++){ # loop 1B
1392 x = q[i]; # DR_MISALIGNMENT(q) = 3
1393 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1397 -- Possibility 2: we do loop peeling:
1398 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1399 x = q[i];
1400 p[i] = y;
1402 for (i = 3; i < N; i++){ # loop 2A
1403 x = q[i]; # DR_MISALIGNMENT(q) = 0
1404 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1407 -- Possibility 3: combination of loop peeling and versioning:
1408 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1409 x = q[i];
1410 p[i] = y;
1412 if (p is aligned) {
1413 for (i = 3; i<N; i++){ # loop 3A
1414 x = q[i]; # DR_MISALIGNMENT(q) = 0
1415 p[i] = y; # DR_MISALIGNMENT(p) = 0
1418 else {
1419 for (i = 3; i<N; i++){ # loop 3B
1420 x = q[i]; # DR_MISALIGNMENT(q) = 0
1421 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1425 These loops are later passed to loop_transform to be vectorized. The
1426 vectorizer will use the alignment information to guide the transformation
1427 (whether to generate regular loads/stores, or with special handling for
1428 misalignment). */
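/* Editorial note on the peeling amount computed below: for a reference
   with known misalignment the candidate number of peeled iterations is

     mis       = DR_MISALIGNMENT (dr) / element_size;
     npeel_tmp = (nelements - mis) & (nelements - 1);

   so a V4SI reference misaligned by one element needs 3 peeled scalar
   iterations (negative-step references use mis - nelements instead).  */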
1430 bool
1431 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1433 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1434 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1435 enum dr_alignment_support supportable_dr_alignment;
1436 struct data_reference *dr0 = NULL, *first_store = NULL;
1437 struct data_reference *dr;
1438 unsigned int i, j;
1439 bool do_peeling = false;
1440 bool do_versioning = false;
1441 bool stat;
1442 gimple *stmt;
1443 stmt_vec_info stmt_info;
1444 unsigned int npeel = 0;
1445 bool all_misalignments_unknown = true;
1446 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1447 unsigned possible_npeel_number = 1;
1448 tree vectype;
1449 unsigned int nelements, mis, same_align_drs_max = 0;
1450 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1451 hash_table<peel_info_hasher> peeling_htab (1);
1453 if (dump_enabled_p ())
1454 dump_printf_loc (MSG_NOTE, vect_location,
1455 "=== vect_enhance_data_refs_alignment ===\n");
1457 /* Reset data so we can safely be called multiple times. */
1458 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1459 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1461 /* While cost model enhancements are expected in the future, the high level
1462 view of the code at this time is as follows:
1464 A) If there is a misaligned access then see if peeling to align
1465 this access can make all data references satisfy
1466 vect_supportable_dr_alignment. If so, update data structures
1467 as needed and return true.
1469 B) If peeling wasn't possible and there is a data reference with an
1470 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1471 then see if loop versioning checks can be used to make all data
1472 references satisfy vect_supportable_dr_alignment. If so, update
1473 data structures as needed and return true.
1475 C) If neither peeling nor versioning were successful then return false if
1476 any data reference does not satisfy vect_supportable_dr_alignment.
1478 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1480 Note, Possibility 3 above (which is peeling and versioning together) is not
1481 being done at this time. */
1483 /* (1) Peeling to force alignment. */
1485 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1486 Considerations:
1487 + How many accesses will become aligned due to the peeling
1488 - How many accesses will become unaligned due to the peeling,
1489 and the cost of misaligned accesses.
1490 - The cost of peeling (the extra runtime checks, the increase
1491 in code size). */
1493 FOR_EACH_VEC_ELT (datarefs, i, dr)
1495 stmt = DR_STMT (dr);
1496 stmt_info = vinfo_for_stmt (stmt);
1498 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1499 continue;
1501 /* For interleaving, only the alignment of the first access
1502 matters. */
1503 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1504 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1505 continue;
1507 /* For invariant accesses there is nothing to enhance. */
1508 if (integer_zerop (DR_STEP (dr)))
1509 continue;
1511 /* Strided accesses perform only component accesses, alignment is
1512 irrelevant for them. */
1513 if (STMT_VINFO_STRIDED_P (stmt_info)
1514 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1515 continue;
1517 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1518 do_peeling = vector_alignment_reachable_p (dr);
1519 if (do_peeling)
1521 if (known_alignment_for_access_p (dr))
1523 unsigned int npeel_tmp;
1524 bool negative = tree_int_cst_compare (DR_STEP (dr),
1525 size_zero_node) < 0;
1527 /* Save info about DR in the hash table. */
1528 vectype = STMT_VINFO_VECTYPE (stmt_info);
1529 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1530 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1531 TREE_TYPE (DR_REF (dr))));
1532 npeel_tmp = (negative
1533 ? (mis - nelements) : (nelements - mis))
1534 & (nelements - 1);
1536 /* For multiple types, it is possible that the bigger type access
1537 will have more than one peeling option. E.g., a loop with two
1538 types: one of size (vector size / 4), and the other one of
1539 size (vector size / 8). The vectorization factor will be 8. If both
1540 accesses are misaligned by 3, the first one needs one scalar
1541 iteration to be aligned, and the second one needs 5. But the
1542 first one will be aligned also by peeling 5 scalar
1543 iterations, and in that case both accesses will be aligned.
1544 Hence, except for the immediate peeling amount, we also want
1545 to try to add full vector size, while we don't exceed
1546 vectorization factor.
1547 We do this automatically for the cost model, since we calculate the cost
1548 for every peeling option. */
1549 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1551 if (STMT_SLP_TYPE (stmt_info))
1552 possible_npeel_number
1553 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1554 else
1555 possible_npeel_number = vf / nelements;
1558 /* Handle the aligned case. We may decide to align some other
1559 access, making DR unaligned. */
1560 if (DR_MISALIGNMENT (dr) == 0)
1562 npeel_tmp = 0;
1563 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1564 possible_npeel_number++;
1567 for (j = 0; j < possible_npeel_number; j++)
1569 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1570 dr, npeel_tmp);
1571 npeel_tmp += nelements;
1574 all_misalignments_unknown = false;
1575 /* Data-ref that was chosen for the case that all the
1576 misalignments are unknown is not relevant anymore, since we
1577 have a data-ref with known alignment. */
1578 dr0 = NULL;
1580 else
1582 /* If we don't know any misalignment values, we prefer
1583 peeling for data-ref that has the maximum number of data-refs
1584 with the same alignment, unless the target prefers to align
1585 stores over loads. */
1586 if (all_misalignments_unknown)
1588 unsigned same_align_drs
1589 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1590 if (!dr0
1591 || same_align_drs_max < same_align_drs)
1593 same_align_drs_max = same_align_drs;
1594 dr0 = dr;
1596 /* For data-refs with the same number of related
1597 accesses prefer the one where the misalign
1598 computation will be invariant in the outermost loop. */
1599 else if (same_align_drs_max == same_align_drs)
1601 struct loop *ivloop0, *ivloop;
1602 ivloop0 = outermost_invariant_loop_for_expr
1603 (loop, DR_BASE_ADDRESS (dr0));
1604 ivloop = outermost_invariant_loop_for_expr
1605 (loop, DR_BASE_ADDRESS (dr));
1606 if ((ivloop && !ivloop0)
1607 || (ivloop && ivloop0
1608 && flow_loop_nested_p (ivloop, ivloop0)))
1609 dr0 = dr;
1612 if (!first_store && DR_IS_WRITE (dr))
1613 first_store = dr;
1616 /* If there are both known and unknown misaligned accesses in the
1617 loop, we choose peeling amount according to the known
1618 accesses. */
1619 if (!supportable_dr_alignment)
1621 dr0 = dr;
1622 if (!first_store && DR_IS_WRITE (dr))
1623 first_store = dr;
1627 else
1629 if (!aligned_access_p (dr))
1631 if (dump_enabled_p ())
1632 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1633 "vector alignment may not be reachable\n");
1634 break;
1639 /* Check if we can possibly peel the loop. */
1640 if (!vect_can_advance_ivs_p (loop_vinfo)
1641 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1642 || loop->inner)
1643 do_peeling = false;
1645 if (do_peeling
1646 && all_misalignments_unknown
1647 && vect_supportable_dr_alignment (dr0, false))
1649 /* Check whether the target prefers to align stores over loads, i.e., if
1650 misaligned stores are more expensive than misaligned loads (taking
1651 drs with same alignment into account). */
1652 if (first_store && DR_IS_READ (dr0))
1654 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1655 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1656 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1657 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1658 stmt_vector_for_cost dummy;
1659 dummy.create (2);
1661 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1662 &dummy);
1663 vect_get_data_access_cost (first_store, &store_inside_cost,
1664 &store_outside_cost, &dummy);
1666 dummy.release ();
1668 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1669 aligning the load DR0). */
1670 load_inside_penalty = store_inside_cost;
1671 load_outside_penalty = store_outside_cost;
1672 for (i = 0;
1673 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1674 DR_STMT (first_store))).iterate (i, &dr);
1675 i++)
1676 if (DR_IS_READ (dr))
1678 load_inside_penalty += load_inside_cost;
1679 load_outside_penalty += load_outside_cost;
1681 else
1683 load_inside_penalty += store_inside_cost;
1684 load_outside_penalty += store_outside_cost;
1687 /* Calculate the penalty for leaving DR0 unaligned (by
1688 aligning the FIRST_STORE). */
1689 store_inside_penalty = load_inside_cost;
1690 store_outside_penalty = load_outside_cost;
1691 for (i = 0;
1692 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1693 DR_STMT (dr0))).iterate (i, &dr);
1694 i++)
1695 if (DR_IS_READ (dr))
1697 store_inside_penalty += load_inside_cost;
1698 store_outside_penalty += load_outside_cost;
1700 else
1702 store_inside_penalty += store_inside_cost;
1703 store_outside_penalty += store_outside_cost;
1706 if (load_inside_penalty > store_inside_penalty
1707 || (load_inside_penalty == store_inside_penalty
1708 && load_outside_penalty > store_outside_penalty))
1709 dr0 = first_store;
1712 /* In case there are only loads with different unknown misalignments, use
1713 peeling only if it may help to align other accesses in the loop or
1714 if it may help improve load bandwidth when we'd end up using
1715 unaligned loads. */
1716 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1717 if (!first_store
1718 && !STMT_VINFO_SAME_ALIGN_REFS (
1719 vinfo_for_stmt (DR_STMT (dr0))).length ()
1720 && (vect_supportable_dr_alignment (dr0, false)
1721 != dr_unaligned_supported
1722 || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1723 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1724 do_peeling = false;
1727 if (do_peeling && !dr0)
1729 /* Peeling is possible, but there is no data access that is not supported
1730 unless aligned. So we try to choose the best possible peeling. */
1732 /* We should get here only if there are drs with known misalignment. */
1733 gcc_assert (!all_misalignments_unknown);
1735 /* Choose the best peeling from the hash table. */
1736 dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
1737 loop_vinfo, &npeel,
1738 &body_cost_vec);
1739 if (!dr0 || !npeel)
1740 do_peeling = false;
1743 if (do_peeling)
1745 stmt = DR_STMT (dr0);
1746 stmt_info = vinfo_for_stmt (stmt);
1747 vectype = STMT_VINFO_VECTYPE (stmt_info);
1748 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1750 if (known_alignment_for_access_p (dr0))
1752 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1753 size_zero_node) < 0;
1754 if (!npeel)
1756 /* Since it's known at compile time, compute the number of
1757 iterations in the peeled loop (the peeling factor) for use in
1758 updating DR_MISALIGNMENT values. The peeling factor is the
1759 vectorization factor minus the misalignment as an element
1760 count. */
1761 mis = DR_MISALIGNMENT (dr0);
1762 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1763 npeel = ((negative ? mis - nelements : nelements - mis)
1764 & (nelements - 1));
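/* Illustration (not part of the original source; concrete numbers assumed):
   with V4SI vectors (nelements = 4) and 4-byte elements, a DR_MISALIGNMENT
   of 8 bytes gives
     mis   = 8 / 4 = 2 elements
     npeel = (4 - 2) & (4 - 1) = 2,
   i.e. peeling two scalar iterations aligns DR0 for a positive step.  */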
1767 /* For interleaved data access every iteration accesses all the
1768 members of the group, therefore we divide the number of iterations
1769 by the group size. */
1770 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1771 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1772 npeel /= GROUP_SIZE (stmt_info);
1774 if (dump_enabled_p ())
1775 dump_printf_loc (MSG_NOTE, vect_location,
1776 "Try peeling by %d\n", npeel);
1779 /* Ensure that all data refs can be vectorized after the peel. */
1780 FOR_EACH_VEC_ELT (datarefs, i, dr)
1782 int save_misalignment;
1784 if (dr == dr0)
1785 continue;
1787 stmt = DR_STMT (dr);
1788 stmt_info = vinfo_for_stmt (stmt);
1789 /* For interleaving, only the alignment of the first access
1790 matters. */
1791 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1792 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1793 continue;
1795 /* Strided accesses perform only component accesses, alignment is
1796 irrelevant for them. */
1797 if (STMT_VINFO_STRIDED_P (stmt_info)
1798 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1799 continue;
1801 save_misalignment = DR_MISALIGNMENT (dr);
1802 vect_update_misalignment_for_peel (dr, dr0, npeel);
1803 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1804 SET_DR_MISALIGNMENT (dr, save_misalignment);
1806 if (!supportable_dr_alignment)
1808 do_peeling = false;
1809 break;
1813 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1815 stat = vect_verify_datarefs_alignment (loop_vinfo);
1816 if (!stat)
1817 do_peeling = false;
1818 else
1820 body_cost_vec.release ();
1821 return stat;
1825 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1826 if (do_peeling)
1828 unsigned max_allowed_peel
1829 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1830 if (max_allowed_peel != (unsigned)-1)
1832 unsigned max_peel = npeel;
1833 if (max_peel == 0)
1835 gimple *dr_stmt = DR_STMT (dr0);
1836 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1837 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1838 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1840 if (max_peel > max_allowed_peel)
1842 do_peeling = false;
1843 if (dump_enabled_p ())
1844 dump_printf_loc (MSG_NOTE, vect_location,
1845 "Disable peeling, max peels reached: %d\n", max_peel);
1850 /* Cost model #2 - if peeling may result in a remaining loop not
1851 iterating enough to be vectorized then do not peel. */
1852 if (do_peeling
1853 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1855 unsigned max_peel
1856 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1857 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1858 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1859 do_peeling = false;
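/* Illustration (assumed numbers, not from the original source): with a
   vectorization factor of 4 and npeel == 0, max_peel is 3; if the loop is
   known to run only 6 iterations, 6 < 4 + 3, so peeling is rejected because
   the remaining loop could not complete even one vector iteration.  */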
1862 if (do_peeling)
1864 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1865 If the misalignment of DR_i is identical to that of dr0 then set
1866 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1867 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1868 by the peeling factor times the element size of DR_i (MOD the
1869 vectorization factor times the size). Otherwise, the
1870 misalignment of DR_i must be set to unknown. */
1871 FOR_EACH_VEC_ELT (datarefs, i, dr)
1872 if (dr != dr0)
1874 /* Strided accesses perform only component accesses, alignment
1875 is irrelevant for them. */
1876 stmt_info = vinfo_for_stmt (DR_STMT (dr));
1877 if (STMT_VINFO_STRIDED_P (stmt_info)
1878 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1879 continue;
1881 vect_update_misalignment_for_peel (dr, dr0, npeel);
1884 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1885 if (npeel)
1886 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1887 else
1888 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1889 = DR_MISALIGNMENT (dr0);
1890 SET_DR_MISALIGNMENT (dr0, 0);
1891 if (dump_enabled_p ())
1893 dump_printf_loc (MSG_NOTE, vect_location,
1894 "Alignment of access forced using peeling.\n");
1895 dump_printf_loc (MSG_NOTE, vect_location,
1896 "Peeling for alignment will be applied.\n");
1898 /* The inside-loop cost will be accounted for in vectorizable_load
1899 and vectorizable_store correctly with adjusted alignments.
1900 Drop the body_cost_vec on the floor here. */
1901 body_cost_vec.release ();
1903 stat = vect_verify_datarefs_alignment (loop_vinfo);
1904 gcc_assert (stat);
1905 return stat;
1909 body_cost_vec.release ();
1911 /* (2) Versioning to force alignment. */
1913 /* Try versioning if:
1914 1) optimize loop for speed
1915 2) there is at least one unsupported misaligned data ref with an unknown
1916 misalignment, and
1917 3) all misaligned data refs with a known misalignment are supported, and
1918 4) the number of runtime alignment checks is within reason. */
1920 do_versioning =
1921 optimize_loop_nest_for_speed_p (loop)
1922 && (!loop->inner); /* FORNOW */
1924 if (do_versioning)
1926 FOR_EACH_VEC_ELT (datarefs, i, dr)
1928 stmt = DR_STMT (dr);
1929 stmt_info = vinfo_for_stmt (stmt);
1931 /* For interleaving, only the alignment of the first access
1932 matters. */
1933 if (aligned_access_p (dr)
1934 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1935 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1936 continue;
1938 if (STMT_VINFO_STRIDED_P (stmt_info))
1940 /* Strided loads perform only component accesses, alignment is
1941 irrelevant for them. */
1942 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1943 continue;
1944 do_versioning = false;
1945 break;
1948 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1950 if (!supportable_dr_alignment)
1952 gimple *stmt;
1953 int mask;
1954 tree vectype;
1956 if (known_alignment_for_access_p (dr)
1957 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1958 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1960 do_versioning = false;
1961 break;
1964 stmt = DR_STMT (dr);
1965 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1966 gcc_assert (vectype);
1968 /* The rightmost bits of an aligned address must be zeros.
1969 Construct the mask needed for this test. For example,
1970 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1971 mask must be 15 = 0xf. */
1972 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
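/* Sketch (illustration only; the real condition is built later by the
   versioning code): for a 16-byte vector mode the guard of the versioned
   loop amounts to something like

     if ((((uintptr_t) addr_1 | (uintptr_t) addr_2) & 0xf) == 0)
       ... vectorized loop ...
     else
       ... unvectorized fallback loop ...

   where addr_1, addr_2, ... are the addresses of the data refs whose
   statements are pushed onto LOOP_VINFO_MAY_MISALIGN_STMTS below.  */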
1974 /* FORNOW: use the same mask to test all potentially unaligned
1975 references in the loop. The vectorizer currently supports
1976 a single vector size, see the reference to
1977 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1978 vectorization factor is computed. */
1979 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1980 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1981 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1982 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1983 DR_STMT (dr));
1987 /* Versioning requires at least one misaligned data reference. */
1988 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1989 do_versioning = false;
1990 else if (!do_versioning)
1991 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1994 if (do_versioning)
1996 vec<gimple *> may_misalign_stmts
1997 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1998 gimple *stmt;
2000 /* It can now be assumed that the data references in the statements
2001 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2002 of the loop being vectorized. */
2003 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2005 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2006 dr = STMT_VINFO_DATA_REF (stmt_info);
2007 SET_DR_MISALIGNMENT (dr, 0);
2008 if (dump_enabled_p ())
2009 dump_printf_loc (MSG_NOTE, vect_location,
2010 "Alignment of access forced using versioning.\n");
2013 if (dump_enabled_p ())
2014 dump_printf_loc (MSG_NOTE, vect_location,
2015 "Versioning for alignment will be applied.\n");
2017 /* Peeling and versioning can't be done together at this time. */
2018 gcc_assert (! (do_peeling && do_versioning));
2020 stat = vect_verify_datarefs_alignment (loop_vinfo);
2021 gcc_assert (stat);
2022 return stat;
2025 /* This point is reached if neither peeling nor versioning is being done. */
2026 gcc_assert (! (do_peeling || do_versioning));
2028 stat = vect_verify_datarefs_alignment (loop_vinfo);
2029 return stat;
2033 /* Function vect_find_same_alignment_drs.
2035 Update group and alignment relations according to the chosen
2036 vectorization factor. */
2038 static void
2039 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
2040 loop_vec_info loop_vinfo)
2042 unsigned int i;
2043 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2044 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2045 struct data_reference *dra = DDR_A (ddr);
2046 struct data_reference *drb = DDR_B (ddr);
2047 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2048 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2049 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
2050 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
2051 lambda_vector dist_v;
2052 unsigned int loop_depth;
2054 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2055 return;
2057 if (dra == drb)
2058 return;
2060 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
2061 return;
2063 /* Loop-based vectorization and known data dependence. */
2064 if (DDR_NUM_DIST_VECTS (ddr) == 0)
2065 return;
2067 /* Data-dependence analysis reports a distance vector of zero
2068 for data-references that overlap only in the first iteration
2069 but whose steps have different signs (see PR45764).
2070 So as a sanity check require equal DR_STEP. */
2071 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2072 return;
2074 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
2075 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
2077 int dist = dist_v[loop_depth];
2079 if (dump_enabled_p ())
2080 dump_printf_loc (MSG_NOTE, vect_location,
2081 "dependence distance = %d.\n", dist);
2083 /* Same loop iteration. */
2084 if (dist == 0
2085 || (dist % vectorization_factor == 0 && dra_size == drb_size))
2087 /* Two references whose distance is zero or a multiple of VF have the same alignment. */
2088 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2089 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2090 if (dump_enabled_p ())
2092 dump_printf_loc (MSG_NOTE, vect_location,
2093 "accesses have the same alignment.\n");
2094 dump_printf (MSG_NOTE,
2095 "dependence distance modulo vf == 0 between ");
2096 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2097 dump_printf (MSG_NOTE, " and ");
2098 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2099 dump_printf (MSG_NOTE, "\n");
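/* Illustration (assumed numbers): with a vectorization factor of 4 and
   4-byte elements, references a[i] and a[i+8] have dependence distance 8;
   8 % 4 == 0 and both accesses have the same size, so they are always a
   whole number of vectors (32 bytes) apart and therefore share the same
   misalignment.  */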
2106 /* Function vect_analyze_data_refs_alignment
2108 Analyze the alignment of the data-references in the loop.
2109 Return FALSE if a data reference is found that cannot be vectorized. */
2111 bool
2112 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2114 if (dump_enabled_p ())
2115 dump_printf_loc (MSG_NOTE, vect_location,
2116 "=== vect_analyze_data_refs_alignment ===\n");
2118 /* Mark groups of data references with same alignment using
2119 data dependence information. */
2120 vec<ddr_p> ddrs = vinfo->ddrs;
2121 struct data_dependence_relation *ddr;
2122 unsigned int i;
2124 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2125 vect_find_same_alignment_drs (ddr, vinfo);
2127 vec<data_reference_p> datarefs = vinfo->datarefs;
2128 struct data_reference *dr;
2130 FOR_EACH_VEC_ELT (datarefs, i, dr)
2132 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2133 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2134 && !vect_compute_data_ref_alignment (dr))
2136 /* Strided accesses perform only component accesses, misalignment
2137 information is irrelevant for them. */
2138 if (STMT_VINFO_STRIDED_P (stmt_info)
2139 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2140 continue;
2142 if (dump_enabled_p ())
2143 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2144 "not vectorized: can't calculate alignment "
2145 "for data ref.\n");
2147 return false;
2151 return true;
2155 /* Analyze alignment of DRs of stmts in NODE. */
2157 static bool
2158 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2160 /* We vectorize from the first scalar stmt in the node unless
2161 the node is permuted in which case we start from the first
2162 element in the group. */
2163 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2164 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2165 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2166 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2168 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2169 if (! vect_compute_data_ref_alignment (dr)
2170 /* For creating the data-ref pointer we need alignment of the
2171 first element anyway. */
2172 || (dr != first_dr
2173 && ! vect_compute_data_ref_alignment (first_dr))
2174 || ! verify_data_ref_alignment (dr))
2176 if (dump_enabled_p ())
2177 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2178 "not vectorized: bad data alignment in basic "
2179 "block.\n");
2180 return false;
2183 return true;
2186 /* Function vect_slp_analyze_and_verify_instance_alignment
2188 Analyze the alignment of the data-references in the SLP instance.
2189 Return FALSE if a data reference is found that cannot be vectorized. */
2191 bool
2192 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2194 if (dump_enabled_p ())
2195 dump_printf_loc (MSG_NOTE, vect_location,
2196 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2198 slp_tree node;
2199 unsigned i;
2200 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2201 if (! vect_slp_analyze_and_verify_node_alignment (node))
2202 return false;
2204 node = SLP_INSTANCE_TREE (instance);
2205 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2206 && ! vect_slp_analyze_and_verify_node_alignment
2207 (SLP_INSTANCE_TREE (instance)))
2208 return false;
2210 return true;
2214 /* Analyze groups of accesses: check that DR belongs to a group of
2215 accesses of legal size, step, etc. Detect gaps, single element
2216 interleaving, and other special cases. Set grouped access info.
2217 Collect groups of strided stores for further use in SLP analysis.
2218 Worker for vect_analyze_group_access. */
2220 static bool
2221 vect_analyze_group_access_1 (struct data_reference *dr)
2223 tree step = DR_STEP (dr);
2224 tree scalar_type = TREE_TYPE (DR_REF (dr));
2225 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2226 gimple *stmt = DR_STMT (dr);
2227 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2228 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2229 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2230 HOST_WIDE_INT dr_step = -1;
2231 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2232 bool slp_impossible = false;
2234 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2235 size of the interleaving group (including gaps). */
2236 if (tree_fits_shwi_p (step))
2238 dr_step = tree_to_shwi (step);
2239 /* Check that STEP is a multiple of type size. Otherwise there is
2240 a non-element-sized gap at the end of the group which we
2241 cannot represent in GROUP_GAP or GROUP_SIZE.
2242 ??? As we can handle non-constant step fine here we should
2243 simply remove uses of GROUP_GAP between the last and first
2244 element and instead rely on DR_STEP. GROUP_SIZE then would
2245 simply not include that gap. */
2246 if ((dr_step % type_size) != 0)
2248 if (dump_enabled_p ())
2250 dump_printf_loc (MSG_NOTE, vect_location,
2251 "Step ");
2252 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2253 dump_printf (MSG_NOTE,
2254 " is not a multiple of the element size for ");
2255 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2256 dump_printf (MSG_NOTE, "\n");
2258 return false;
2260 groupsize = absu_hwi (dr_step) / type_size;
2262 else
2263 groupsize = 0;
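/* Illustration (assumed numbers): loads of a 4-byte field with a constant
   DR_STEP of 16 bytes give groupsize = 16 / 4 = 4, i.e. an interleaving
   group of four elements per iteration, gaps included.  */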
2265 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2266 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2268 /* Check if this DR is part of an interleaving group, and is a single
2269 element of the group that is accessed in the loop. */
2271 /* Gaps are supported only for loads. STEP must be a multiple of the type
2272 size. The size of the group must be a power of 2. */
2273 if (DR_IS_READ (dr)
2274 && (dr_step % type_size) == 0
2275 && groupsize > 0
2276 && pow2p_hwi (groupsize))
2278 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2279 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2280 GROUP_GAP (stmt_info) = groupsize - 1;
2281 if (dump_enabled_p ())
2283 dump_printf_loc (MSG_NOTE, vect_location,
2284 "Detected single element interleaving ");
2285 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2286 dump_printf (MSG_NOTE, " step ");
2287 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2288 dump_printf (MSG_NOTE, "\n");
2291 return true;
2294 if (dump_enabled_p ())
2296 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2297 "not consecutive access ");
2298 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2301 if (bb_vinfo)
2303 /* Mark the statement as unvectorizable. */
2304 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2305 return true;
2308 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2309 STMT_VINFO_STRIDED_P (stmt_info) = true;
2310 return true;
2313 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2315 /* First stmt in the interleaving chain. Check the chain. */
2316 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2317 struct data_reference *data_ref = dr;
2318 unsigned int count = 1;
2319 tree prev_init = DR_INIT (data_ref);
2320 gimple *prev = stmt;
2321 HOST_WIDE_INT diff, gaps = 0;
2323 while (next)
2325 /* Skip same data-refs. In case that two or more stmts share
2326 data-ref (supported only for loads), we vectorize only the first
2327 stmt, and the rest get their vectorized loads from the first
2328 one. */
2329 if (!tree_int_cst_compare (DR_INIT (data_ref),
2330 DR_INIT (STMT_VINFO_DATA_REF (
2331 vinfo_for_stmt (next)))))
2333 if (DR_IS_WRITE (data_ref))
2335 if (dump_enabled_p ())
2336 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2337 "Two store stmts share the same dr.\n");
2338 return false;
2341 if (dump_enabled_p ())
2342 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2343 "Two or more load stmts share the same dr.\n");
2345 /* For loads, use the same data-ref load. */
2346 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2348 prev = next;
2349 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2350 continue;
2353 prev = next;
2354 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2356 /* All group members have the same STEP by construction. */
2357 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2359 /* Check that the distance between two accesses is equal to the type
2360 size. Otherwise, we have gaps. */
2361 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2362 - TREE_INT_CST_LOW (prev_init)) / type_size;
2363 if (diff != 1)
2365 /* FORNOW: SLP of accesses with gaps is not supported. */
2366 slp_impossible = true;
2367 if (DR_IS_WRITE (data_ref))
2369 if (dump_enabled_p ())
2370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2371 "interleaved store with gaps\n");
2372 return false;
2375 gaps += diff - 1;
2378 last_accessed_element += diff;
2380 /* Store the gap from the previous member of the group. If there is no
2381 gap in the access, GROUP_GAP is always 1. */
2382 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2384 prev_init = DR_INIT (data_ref);
2385 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2386 /* Count the number of data-refs in the chain. */
2387 count++;
2390 if (groupsize == 0)
2391 groupsize = count + gaps;
2393 if (groupsize > UINT_MAX)
2395 if (dump_enabled_p ())
2396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2397 "group is too large\n");
2398 return false;
2401 /* Check that the size of the interleaving is equal to count for stores,
2402 i.e., that there are no gaps. */
2403 if (groupsize != count
2404 && !DR_IS_READ (dr))
2406 if (dump_enabled_p ())
2407 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2408 "interleaved store with gaps\n");
2409 return false;
2412 /* If there is a gap after the last load in the group it is the
2413 difference between the groupsize and the last accessed
2414 element.
2415 When there is no gap, this difference should be 0. */
2416 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2418 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
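/* Illustration (assumed numbers): for 4-byte loads p[3*i] and p[3*i+1]
   with DR_STEP == 12, groupsize is 3, count is 2 and the init difference
   between the two members is 1 element, so last_accessed_element ends up
   as 2 and the first statement gets GROUP_GAP = 3 - 2 = 1, the one-element
   gap left after the group in each iteration.  */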
2419 if (dump_enabled_p ())
2421 dump_printf_loc (MSG_NOTE, vect_location,
2422 "Detected interleaving ");
2423 if (DR_IS_READ (dr))
2424 dump_printf (MSG_NOTE, "load ");
2425 else
2426 dump_printf (MSG_NOTE, "store ");
2427 dump_printf (MSG_NOTE, "of size %u starting with ",
2428 (unsigned)groupsize);
2429 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2430 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2431 dump_printf_loc (MSG_NOTE, vect_location,
2432 "There is a gap of %u elements after the group\n",
2433 GROUP_GAP (vinfo_for_stmt (stmt)));
2436 /* SLP: create an SLP data structure for every interleaving group of
2437 stores for further analysis in vect_analyze_slp. */
2438 if (DR_IS_WRITE (dr) && !slp_impossible)
2440 if (loop_vinfo)
2441 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2442 if (bb_vinfo)
2443 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2447 return true;
2450 /* Analyze groups of accesses: check that DR belongs to a group of
2451 accesses of legal size, step, etc. Detect gaps, single element
2452 interleaving, and other special cases. Set grouped access info.
2453 Collect groups of strided stores for further use in SLP analysis. */
2455 static bool
2456 vect_analyze_group_access (struct data_reference *dr)
2458 if (!vect_analyze_group_access_1 (dr))
2460 /* Dissolve the group if present. */
2461 gimple *next;
2462 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2463 while (stmt)
2465 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2466 next = GROUP_NEXT_ELEMENT (vinfo);
2467 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2468 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2469 stmt = next;
2471 return false;
2473 return true;
2476 /* Analyze the access pattern of the data-reference DR.
2477 In case of non-consecutive accesses call vect_analyze_group_access() to
2478 analyze groups of accesses. */
2480 static bool
2481 vect_analyze_data_ref_access (struct data_reference *dr)
2483 tree step = DR_STEP (dr);
2484 tree scalar_type = TREE_TYPE (DR_REF (dr));
2485 gimple *stmt = DR_STMT (dr);
2486 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2487 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2488 struct loop *loop = NULL;
2490 if (loop_vinfo)
2491 loop = LOOP_VINFO_LOOP (loop_vinfo);
2493 if (loop_vinfo && !step)
2495 if (dump_enabled_p ())
2496 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2497 "bad data-ref access in loop\n");
2498 return false;
2501 /* Allow loads with zero step in inner-loop vectorization. */
2502 if (loop_vinfo && integer_zerop (step))
2504 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2505 if (!nested_in_vect_loop_p (loop, stmt))
2506 return DR_IS_READ (dr);
2507 /* Allow references with zero step for outer loops marked
2508 with pragma omp simd only - it guarantees absence of
2509 loop-carried dependencies between inner loop iterations. */
2510 if (!loop->force_vectorize)
2512 if (dump_enabled_p ())
2513 dump_printf_loc (MSG_NOTE, vect_location,
2514 "zero step in inner loop of nest\n");
2515 return false;
2519 if (loop && nested_in_vect_loop_p (loop, stmt))
2521 /* Interleaved accesses are not yet supported within outer-loop
2522 vectorization for references in the inner-loop. */
2523 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2525 /* For the rest of the analysis we use the outer-loop step. */
2526 step = STMT_VINFO_DR_STEP (stmt_info);
2527 if (integer_zerop (step))
2529 if (dump_enabled_p ())
2530 dump_printf_loc (MSG_NOTE, vect_location,
2531 "zero step in outer loop.\n");
2532 return DR_IS_READ (dr);
2536 /* Consecutive? */
2537 if (TREE_CODE (step) == INTEGER_CST)
2539 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2540 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2541 || (dr_step < 0
2542 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2544 /* Mark that it is not interleaving. */
2545 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2546 return true;
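/* Illustration (assumed example): a load of a[i] with 4-byte elements has
   DR_STEP == 4 == TYPE_SIZE_UNIT (int), so it is treated as consecutive
   and excluded from interleaving above; a load of a[2*i] would have step 8
   and falls through to the group analysis below.  */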
2550 if (loop && nested_in_vect_loop_p (loop, stmt))
2552 if (dump_enabled_p ())
2553 dump_printf_loc (MSG_NOTE, vect_location,
2554 "grouped access in outer loop.\n");
2555 return false;
2559 /* Assume this is a DR handled by non-constant strided load case. */
2560 if (TREE_CODE (step) != INTEGER_CST)
2561 return (STMT_VINFO_STRIDED_P (stmt_info)
2562 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2563 || vect_analyze_group_access (dr)));
2565 /* Non-consecutive access - check if it's part of an interleaving group. */
2566 return vect_analyze_group_access (dr);
2571 /* A helper function used in the comparator function to sort data
2572 references. T1 and T2 are two data references to be compared.
2573 The function returns -1, 0, or 1. */
2575 static int
2576 compare_tree (tree t1, tree t2)
2578 int i, cmp;
2579 enum tree_code code;
2580 char tclass;
2582 if (t1 == t2)
2583 return 0;
2584 if (t1 == NULL)
2585 return -1;
2586 if (t2 == NULL)
2587 return 1;
2589 STRIP_NOPS (t1);
2590 STRIP_NOPS (t2);
2592 if (TREE_CODE (t1) != TREE_CODE (t2))
2593 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2595 code = TREE_CODE (t1);
2596 switch (code)
2598 /* For const values, we can just use hash values for comparisons. */
2599 case INTEGER_CST:
2600 case REAL_CST:
2601 case FIXED_CST:
2602 case STRING_CST:
2603 case COMPLEX_CST:
2604 case VECTOR_CST:
2606 hashval_t h1 = iterative_hash_expr (t1, 0);
2607 hashval_t h2 = iterative_hash_expr (t2, 0);
2608 if (h1 != h2)
2609 return h1 < h2 ? -1 : 1;
2610 break;
2613 case SSA_NAME:
2614 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2615 if (cmp != 0)
2616 return cmp;
2618 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2619 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2620 break;
2622 default:
2623 tclass = TREE_CODE_CLASS (code);
2625 /* For var-decl, we could compare their UIDs. */
2626 if (tclass == tcc_declaration)
2628 if (DECL_UID (t1) != DECL_UID (t2))
2629 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2630 break;
2633 /* For expressions with operands, compare their operands recursively. */
2634 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2636 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2637 if (cmp != 0)
2638 return cmp;
2642 return 0;
2646 /* Compare two data-references DRA and DRB to group them into chunks
2647 suitable for grouping. */
2649 static int
2650 dr_group_sort_cmp (const void *dra_, const void *drb_)
2652 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2653 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2654 int cmp;
2656 /* Stabilize sort. */
2657 if (dra == drb)
2658 return 0;
2660 /* DRs in different loops never belong to the same group. */
2661 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2662 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2663 if (loopa != loopb)
2664 return loopa->num < loopb->num ? -1 : 1;
2666 /* Ordering of DRs according to base. */
2667 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2669 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2670 if (cmp != 0)
2671 return cmp;
2674 /* And according to DR_OFFSET. */
2675 if (!dr_equal_offsets_p (dra, drb))
2677 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2678 if (cmp != 0)
2679 return cmp;
2682 /* Put reads before writes. */
2683 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2684 return DR_IS_READ (dra) ? -1 : 1;
2686 /* Then sort by access size. */
2687 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2688 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2690 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2691 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2692 if (cmp != 0)
2693 return cmp;
2696 /* And then by step. */
2697 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2699 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2700 if (cmp != 0)
2701 return cmp;
2704 /* Then sort by DR_INIT. In case of identical DRs, sort by stmt UID. */
2705 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2706 if (cmp == 0)
2707 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2708 return cmp;
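/* Illustration (assumed example): for two accesses with equal base,
   offset, size and step, say a read of a[2*i] and a read of a[2*i+1],
   the comparator orders them by DR_INIT (0 before 4 bytes); a write to
   the same location would sort after the reads.  */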
2711 /* Function vect_analyze_data_ref_accesses.
2713 Analyze the access pattern of all the data references in the loop.
2715 FORNOW: the only access pattern that is considered vectorizable is a
2716 simple step 1 (consecutive) access.
2718 FORNOW: handle only arrays and pointer accesses. */
2720 bool
2721 vect_analyze_data_ref_accesses (vec_info *vinfo)
2723 unsigned int i;
2724 vec<data_reference_p> datarefs = vinfo->datarefs;
2725 struct data_reference *dr;
2727 if (dump_enabled_p ())
2728 dump_printf_loc (MSG_NOTE, vect_location,
2729 "=== vect_analyze_data_ref_accesses ===\n");
2731 if (datarefs.is_empty ())
2732 return true;
2734 /* Sort the array of datarefs to make building the interleaving chains
2735 linear. Don't modify the original vector's order, it is needed for
2736 determining what dependencies are reversed. */
2737 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2738 datarefs_copy.qsort (dr_group_sort_cmp);
2740 /* Build the interleaving chains. */
2741 for (i = 0; i < datarefs_copy.length () - 1;)
2743 data_reference_p dra = datarefs_copy[i];
2744 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2745 stmt_vec_info lastinfo = NULL;
2746 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2748 ++i;
2749 continue;
2751 for (i = i + 1; i < datarefs_copy.length (); ++i)
2753 data_reference_p drb = datarefs_copy[i];
2754 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2755 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2756 break;
2758 /* ??? Imperfect sorting (non-compatible types, non-modulo
2759 accesses, same accesses) can lead to a group to be artificially
2760 split here as we don't just skip over those. If it really
2761 matters we can push those to a worklist and re-iterate
2762 over them. Then we can just skip ahead to the next DR here. */
2764 /* DRs in a different loop should not be put into the same
2765 interleaving group. */
2766 if (gimple_bb (DR_STMT (dra))->loop_father
2767 != gimple_bb (DR_STMT (drb))->loop_father)
2768 break;
2770 /* Check that the data-refs have same first location (except init)
2771 and they are both either store or load (not load and store,
2772 not masked loads or stores). */
2773 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2774 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2775 DR_BASE_ADDRESS (drb), 0)
2776 || !dr_equal_offsets_p (dra, drb)
2777 || !gimple_assign_single_p (DR_STMT (dra))
2778 || !gimple_assign_single_p (DR_STMT (drb)))
2779 break;
2781 /* Check that the data-refs have the same constant size. */
2782 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2783 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2784 if (!tree_fits_uhwi_p (sza)
2785 || !tree_fits_uhwi_p (szb)
2786 || !tree_int_cst_equal (sza, szb))
2787 break;
2789 /* Check that the data-refs have the same step. */
2790 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2791 break;
2793 /* Do not place the same access in the interleaving chain twice. */
2794 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2795 break;
2797 /* Check the types are compatible.
2798 ??? We don't distinguish this during sorting. */
2799 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2800 TREE_TYPE (DR_REF (drb))))
2801 break;
2803 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2804 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2805 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2806 gcc_assert (init_a <= init_b);
2808 /* If init_b == init_a + the size of the type * k, we have an
2809 interleaving, and DRA is accessed before DRB. */
2810 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2811 if (type_size_a == 0
2812 || (init_b - init_a) % type_size_a != 0)
2813 break;
2815 /* If we have a store, the accesses are adjacent. This splits
2816 groups into chunks we support (we don't support vectorization
2817 of stores with gaps). */
2818 if (!DR_IS_READ (dra)
2819 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2820 (DR_INIT (datarefs_copy[i-1]))
2821 != type_size_a))
2822 break;
2824 /* If the step (when non-zero and constant) is greater than the
2825 difference between the data-refs' inits, this splits groups into
2826 suitable sizes. */
2827 if (tree_fits_shwi_p (DR_STEP (dra)))
2829 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2830 if (step != 0 && step <= (init_b - init_a))
2831 break;
2834 if (dump_enabled_p ())
2836 dump_printf_loc (MSG_NOTE, vect_location,
2837 "Detected interleaving ");
2838 if (DR_IS_READ (dra))
2839 dump_printf (MSG_NOTE, "load ");
2840 else
2841 dump_printf (MSG_NOTE, "store ");
2842 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2843 dump_printf (MSG_NOTE, " and ");
2844 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2845 dump_printf (MSG_NOTE, "\n");
2848 /* Link the found element into the group list. */
2849 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2851 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2852 lastinfo = stmtinfo_a;
2854 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2855 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2856 lastinfo = stmtinfo_b;
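/* Illustration (assumed example): for the int stores a[4*i] = x and
   a[4*i+1] = y, both data refs share base, offset and a step of 16 bytes,
   and their inits differ by exactly the 4-byte element size, so the second
   store is linked into the group headed by the first one here.  */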
2860 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2861 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2862 && !vect_analyze_data_ref_access (dr))
2864 if (dump_enabled_p ())
2865 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2866 "not vectorized: complicated access pattern.\n");
2868 if (is_a <bb_vec_info> (vinfo))
2870 /* Mark the statement as not vectorizable. */
2871 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2872 continue;
2874 else
2876 datarefs_copy.release ();
2877 return false;
2881 datarefs_copy.release ();
2882 return true;
2886 /* Operator == between two dr_with_seg_len objects.
2888 This equality operator is used to make sure two data refs
2889 are the same, so that we will consider combining the
2890 aliasing checks of those two pairs of data-dependent data
2891 refs. */
2893 static bool
2894 operator == (const dr_with_seg_len& d1,
2895 const dr_with_seg_len& d2)
2897 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2898 DR_BASE_ADDRESS (d2.dr), 0)
2899 && compare_tree (DR_OFFSET (d1.dr), DR_OFFSET (d2.dr)) == 0
2900 && compare_tree (DR_INIT (d1.dr), DR_INIT (d2.dr)) == 0
2901 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2904 /* Function comp_dr_with_seg_len_pair.
2906 Comparison function for sorting objects of dr_with_seg_len_pair_t
2907 so that we can combine aliasing checks in one scan. */
2909 static int
2910 comp_dr_with_seg_len_pair (const void *pa_, const void *pb_)
2912 const dr_with_seg_len_pair_t* pa = (const dr_with_seg_len_pair_t *) pa_;
2913 const dr_with_seg_len_pair_t* pb = (const dr_with_seg_len_pair_t *) pb_;
2914 const dr_with_seg_len &a1 = pa->first, &a2 = pa->second;
2915 const dr_with_seg_len &b1 = pb->first, &b2 = pb->second;
2917 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2918 if a and c have the same base address and step, and b and d have the same
2919 address and step. Therefore, if either a&c or b&d don't have the same address
2920 and step, we don't care about the order of those two pairs after sorting. */
2921 int comp_res;
2923 if ((comp_res = compare_tree (DR_BASE_ADDRESS (a1.dr),
2924 DR_BASE_ADDRESS (b1.dr))) != 0)
2925 return comp_res;
2926 if ((comp_res = compare_tree (DR_BASE_ADDRESS (a2.dr),
2927 DR_BASE_ADDRESS (b2.dr))) != 0)
2928 return comp_res;
2929 if ((comp_res = compare_tree (DR_STEP (a1.dr), DR_STEP (b1.dr))) != 0)
2930 return comp_res;
2931 if ((comp_res = compare_tree (DR_STEP (a2.dr), DR_STEP (b2.dr))) != 0)
2932 return comp_res;
2933 if ((comp_res = compare_tree (DR_OFFSET (a1.dr), DR_OFFSET (b1.dr))) != 0)
2934 return comp_res;
2935 if ((comp_res = compare_tree (DR_INIT (a1.dr), DR_INIT (b1.dr))) != 0)
2936 return comp_res;
2937 if ((comp_res = compare_tree (DR_OFFSET (a2.dr), DR_OFFSET (b2.dr))) != 0)
2938 return comp_res;
2939 if ((comp_res = compare_tree (DR_INIT (a2.dr), DR_INIT (b2.dr))) != 0)
2940 return comp_res;
2942 return 0;
2945 /* Function vect_vfa_segment_size.
2947 Create an expression that computes the size of segment
2948 that will be accessed for a data reference. The function takes into
2949 account that realignment loads may access one more vector.
2951 Input:
2952 DR: The data reference.
2953 LENGTH_FACTOR: segment length to consider.
2955 Return an expression whose value is the size of segment which will be
2956 accessed by DR. */
2958 static tree
2959 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2961 tree segment_length;
2963 if (integer_zerop (DR_STEP (dr)))
2964 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2965 else
2966 segment_length = size_binop (MULT_EXPR,
2967 fold_convert (sizetype, DR_STEP (dr)),
2968 fold_convert (sizetype, length_factor));
2970 if (vect_supportable_dr_alignment (dr, false)
2971 == dr_explicit_realign_optimized)
2973 tree vector_size = TYPE_SIZE_UNIT
2974 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2976 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2978 return segment_length;
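/* Illustration (assumed numbers): for a unit-stride 4-byte access and
   LENGTH_FACTOR equal to a vectorization factor of 4, the segment length
   is 4 * 4 = 16 bytes; if the ref uses the optimized explicit-realign
   scheme, one extra vector (e.g. 16 bytes) is added on top of that.  */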
2981 /* Function vect_no_alias_p.
2983 Given data references A and B with equal base and offset, the alias
2984 relation can be decided at compilation time, return TRUE if they do
2985 not alias each other; return FALSE otherwise. SEGMENT_LENGTH_A
2986 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
2987 respectively. */
2989 static bool
2990 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
2991 tree segment_length_a, tree segment_length_b)
2993 gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
2994 && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
2995 if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
2996 return false;
2998 tree seg_a_min = DR_INIT (a);
2999 tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
3000 seg_a_min, segment_length_a);
3001 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3002 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3003 [a, a+12) */
3004 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3006 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
3007 seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
3008 seg_a_max, unit_size);
3009 seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
3010 DR_INIT (a), unit_size);
3012 tree seg_b_min = DR_INIT (b);
3013 tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
3014 seg_b_min, segment_length_b);
3015 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3017 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
3018 seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
3019 seg_b_max, unit_size);
3020 seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
3021 DR_INIT (b), unit_size);
3024 if (tree_int_cst_le (seg_a_max, seg_b_min)
3025 || tree_int_cst_le (seg_b_max, seg_a_min))
3026 return true;
3028 return false;
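/* Illustration (assumed numbers): with positive steps, DR_INIT (a) == 0,
   SEGMENT_LENGTH_A == 16, DR_INIT (b) == 16 and SEGMENT_LENGTH_B == 16,
   the segments are [0, 16) and [16, 32); seg_a_max <= seg_b_min holds, so
   the two references provably do not alias and no runtime check is needed
   for this pair.  */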
3031 /* Function vect_prune_runtime_alias_test_list.
3033 Prune a list of ddrs to be tested at run-time by versioning for alias.
3034 Merge several alias checks into one if possible.
3035 Return FALSE if the resulting list of ddrs is longer than allowed by
3036 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3038 bool
3039 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3041 vec<ddr_p> may_alias_ddrs =
3042 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3043 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
3044 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3045 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3046 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3048 ddr_p ddr;
3049 unsigned int i;
3050 tree length_factor;
3052 if (dump_enabled_p ())
3053 dump_printf_loc (MSG_NOTE, vect_location,
3054 "=== vect_prune_runtime_alias_test_list ===\n");
3056 if (may_alias_ddrs.is_empty ())
3057 return true;
3059 /* Basically, for each pair of dependent data refs store_ptr_0
3060 and load_ptr_0, we create an expression:
3062 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
3063 || ((load_ptr_0 + load_segment_length_0) <= store_ptr_0)
3065 for aliasing checks. However, in some cases we can decrease
3066 the number of checks by combining two checks into one. For
3067 example, suppose we have another pair of data refs store_ptr_0
3068 and load_ptr_1, and if the following condition is satisfied:
3070 load_ptr_0 < load_ptr_1 &&
3071 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
3073 (this condition means, in each iteration of vectorized loop,
3074 the accessed memory of store_ptr_0 cannot be between the memory
3075 of load_ptr_0 and load_ptr_1.)
3077 we can then use only the following expression to finish the
3078 aliasing checks between store_ptr_0 & load_ptr_0 and
3079 store_ptr_0 & load_ptr_1:
3081 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
3082 || ((load_ptr_1 + load_segment_length_1) <= store_ptr_0)
3084 Note that we only consider the case where load_ptr_0 and load_ptr_1
3085 have the same base address. */
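/* Sketch (illustration only) of what one such runtime test amounts to in
   source form for a store/load pair:

     if (store_ptr + store_segment_length <= load_ptr
         || load_ptr + load_segment_length <= store_ptr)
       ... vectorized loop ...
     else
       ... scalar fallback loop ...
   */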
3087 comp_alias_ddrs.create (may_alias_ddrs.length ());
3089 /* First, we collect all data ref pairs for aliasing checks. */
3090 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3092 int comp_res;
3093 struct data_reference *dr_a, *dr_b;
3094 gimple *dr_group_first_a, *dr_group_first_b;
3095 tree segment_length_a, segment_length_b;
3096 gimple *stmt_a, *stmt_b;
3098 dr_a = DDR_A (ddr);
3099 stmt_a = DR_STMT (DDR_A (ddr));
3100 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3101 if (dr_group_first_a)
3103 stmt_a = dr_group_first_a;
3104 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3107 dr_b = DDR_B (ddr);
3108 stmt_b = DR_STMT (DDR_B (ddr));
3109 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3110 if (dr_group_first_b)
3112 stmt_b = dr_group_first_b;
3113 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3116 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3117 length_factor = scalar_loop_iters;
3118 else
3119 length_factor = size_int (vect_factor);
3120 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3121 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3123 comp_res = compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b));
3124 if (comp_res == 0)
3125 comp_res = compare_tree (DR_OFFSET (dr_a), DR_OFFSET (dr_b));
3127 /* Alias is known at compilation time. */
3128 if (comp_res == 0
3129 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3130 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3131 && TREE_CODE (segment_length_a) == INTEGER_CST
3132 && TREE_CODE (segment_length_b) == INTEGER_CST)
3134 if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b))
3135 continue;
3137 if (dump_enabled_p ())
3138 dump_printf_loc (MSG_NOTE, vect_location,
3139 "not vectorized: compilation time alias.\n");
3141 return false;
3144 dr_with_seg_len_pair_t dr_with_seg_len_pair
3145 (dr_with_seg_len (dr_a, segment_length_a),
3146 dr_with_seg_len (dr_b, segment_length_b));
3148 /* Canonicalize pairs by sorting the two DR members. */
3149 if (comp_res > 0)
3150 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3152 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3155 /* Second, we sort the collected data ref pairs so that we can scan
3156 them once to combine all possible aliasing checks. */
3157 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
3159 /* Third, we scan the sorted dr pairs and check if we can combine
3160 alias checks of two neighboring dr pairs. */
3161 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
3163 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
3164 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
3165 *dr_b1 = &comp_alias_ddrs[i-1].second,
3166 *dr_a2 = &comp_alias_ddrs[i].first,
3167 *dr_b2 = &comp_alias_ddrs[i].second;
3169 /* Remove duplicate data ref pairs. */
3170 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
3172 if (dump_enabled_p ())
3174 dump_printf_loc (MSG_NOTE, vect_location,
3175 "found equal ranges ");
3176 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3177 DR_REF (dr_a1->dr));
3178 dump_printf (MSG_NOTE, ", ");
3179 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3180 DR_REF (dr_b1->dr));
3181 dump_printf (MSG_NOTE, " and ");
3182 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3183 DR_REF (dr_a2->dr));
3184 dump_printf (MSG_NOTE, ", ");
3185 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3186 DR_REF (dr_b2->dr));
3187 dump_printf (MSG_NOTE, "\n");
3190 comp_alias_ddrs.ordered_remove (i--);
3191 continue;
3194 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
3196 /* We consider the case that DR_B1 and DR_B2 are the same memrefs,
3197 and DR_A1 and DR_A2 are two consecutive memrefs. */
3198 if (*dr_a1 == *dr_a2)
3200 std::swap (dr_a1, dr_b1);
3201 std::swap (dr_a2, dr_b2);
3204 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
3205 DR_BASE_ADDRESS (dr_a2->dr), 0)
3206 || !operand_equal_p (DR_OFFSET (dr_a1->dr),
3207 DR_OFFSET (dr_a2->dr), 0)
3208 || !tree_fits_shwi_p (DR_INIT (dr_a1->dr))
3209 || !tree_fits_shwi_p (DR_INIT (dr_a2->dr)))
3210 continue;
3212 /* Make sure dr_a1 starts left of dr_a2. */
3213 if (tree_int_cst_lt (DR_INIT (dr_a2->dr), DR_INIT (dr_a1->dr)))
3214 std::swap (*dr_a1, *dr_a2);
3216 bool do_remove = false;
3217 unsigned HOST_WIDE_INT diff
3218 = (tree_to_shwi (DR_INIT (dr_a2->dr))
3219 - tree_to_shwi (DR_INIT (dr_a1->dr)));
3221 /* If the left segment does not extend beyond the start of the
3222 right segment the new segment length is that of the right
3223 plus the segment distance. */
3224 if (tree_fits_uhwi_p (dr_a1->seg_len)
3225 && compare_tree_int (dr_a1->seg_len, diff) <= 0)
3227 dr_a1->seg_len = size_binop (PLUS_EXPR, dr_a2->seg_len,
3228 size_int (diff));
3229 do_remove = true;
3231 /* Generally the new segment length is the maximum of the
3232 left segment size and the right segment size plus the distance.
3233 ??? We can also build tree MAX_EXPR here but it's not clear this
3234 is profitable. */
3235 else if (tree_fits_uhwi_p (dr_a1->seg_len)
3236 && tree_fits_uhwi_p (dr_a2->seg_len))
3238 unsigned HOST_WIDE_INT seg_len_a1 = tree_to_uhwi (dr_a1->seg_len);
3239 unsigned HOST_WIDE_INT seg_len_a2 = tree_to_uhwi (dr_a2->seg_len);
3240 dr_a1->seg_len = size_int (MAX (seg_len_a1, diff + seg_len_a2));
3241 do_remove = true;
3243 /* Now we check if the following condition is satisfied:
3245 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
3247 where DIFF = DR_A2_INIT - DR_A1_INIT. However,
3248 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
3249 have to make a best estimate. We can get the minimum value
3250 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
3251 then either of the following two conditions can guarantee the
3252 one above:
3254 1: DIFF <= MIN_SEG_LEN_B
3255 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B */
3256 else
3258 unsigned HOST_WIDE_INT min_seg_len_b
3259 = (tree_fits_uhwi_p (dr_b1->seg_len)
3260 ? tree_to_uhwi (dr_b1->seg_len)
3261 : vect_factor);
3263 if (diff <= min_seg_len_b
3264 || (tree_fits_uhwi_p (dr_a1->seg_len)
3265 && diff - tree_to_uhwi (dr_a1->seg_len) < min_seg_len_b))
3267 dr_a1->seg_len = size_binop (PLUS_EXPR,
3268 dr_a2->seg_len, size_int (diff));
3269 do_remove = true;
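/* Illustration (assumed numbers): if dr_a1 has init 0 and seg_len 16 while
   dr_a2 has init 8 and seg_len 16, then diff == 8; the first case does not
   apply (16 > 8), but with both lengths constant the merged segment becomes
   MAX (16, 8 + 16) = 24 bytes, so the two checks against the common DR_B
   collapse into a single one.  */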
3273 if (do_remove)
3275 if (dump_enabled_p ())
3277 dump_printf_loc (MSG_NOTE, vect_location,
3278 "merging ranges for ");
3279 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1->dr));
3280 dump_printf (MSG_NOTE, ", ");
3281 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1->dr));
3282 dump_printf (MSG_NOTE, " and ");
3283 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2->dr));
3284 dump_printf (MSG_NOTE, ", ");
3285 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2->dr));
3286 dump_printf (MSG_NOTE, "\n");
3288 comp_alias_ddrs.ordered_remove (i--);
3293 dump_printf_loc (MSG_NOTE, vect_location,
3294 "improved number of alias checks from %d to %d\n",
3295 may_alias_ddrs.length (), comp_alias_ddrs.length ());
3296 if ((int) comp_alias_ddrs.length () >
3297 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3299 if (dump_enabled_p ())
3300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3301 "number of versioning for alias "
3302 "run-time tests exceeds %d "
3303 "(--param vect-max-version-for-alias-checks)\n",
3304 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3305 return false;
3308 /* All alias checks have been resolved at compilation time. */
3309 if (!comp_alias_ddrs.length ())
3310 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0);
3312 return true;
3315 /* Return true if a non-affine read or write in STMT is suitable for a
3316 gather load or scatter store. Describe the operation in *INFO if so. */
3318 bool
3319 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3320 gather_scatter_info *info)
3322 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3323 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3324 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3325 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3326 tree offtype = NULL_TREE;
3327 tree decl, base, off;
3328 machine_mode pmode;
3329 int punsignedp, reversep, pvolatilep = 0;
3331 base = DR_REF (dr);
3332 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3333 see if we can use the def stmt of the address. */
3334 if (is_gimple_call (stmt)
3335 && gimple_call_internal_p (stmt)
3336 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3337 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3338 && TREE_CODE (base) == MEM_REF
3339 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3340 && integer_zerop (TREE_OPERAND (base, 1))
3341 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3343 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3344 if (is_gimple_assign (def_stmt)
3345 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3346 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3349 /* The gather and scatter builtins need address of the form
3350 loop_invariant + vector * {1, 2, 4, 8}
3352 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3353 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3354 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3355 multiplications and additions in it. To get a vector, we need
3356 a single SSA_NAME that will be defined in the loop and will
3357 contain everything that is not loop invariant and that can be
3358 vectorized. The following code attempts to find such a preexisting
3359 SSA_NAME OFF and put the loop invariants into a tree BASE
3360 that can be gimplified before the loop. */
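/* Illustration (assumed example): for a gather like x[i] = a[b[i]] with
   4-byte elements of a, the analysis below would ideally end up with a
   loop-invariant BASE of &a[0], OFF equal to the SSA_NAME holding b[i]
   (defined in the loop), and scale == 4.  */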
3361 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3362 &punsignedp, &reversep, &pvolatilep);
3363 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3365 if (TREE_CODE (base) == MEM_REF)
3367 if (!integer_zerop (TREE_OPERAND (base, 1)))
3369 if (off == NULL_TREE)
3371 offset_int moff = mem_ref_offset (base);
3372 off = wide_int_to_tree (sizetype, moff);
3374 else
3375 off = size_binop (PLUS_EXPR, off,
3376 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3378 base = TREE_OPERAND (base, 0);
3380 else
3381 base = build_fold_addr_expr (base);
3383 if (off == NULL_TREE)
3384 off = size_zero_node;
3386 /* If base is not loop invariant, then either OFF is 0, in which case
3387 we start with just the constant offset in the loop-invariant BASE and
3388 continue with base as OFF, or else we give up.
3389 We could handle that case by gimplifying the addition of base + off
3390 into some SSA_NAME and using that as off, but for now punt. */
3391 if (!expr_invariant_in_loop_p (loop, base))
3393 if (!integer_zerop (off))
3394 return false;
3395 off = base;
3396 base = size_int (pbitpos / BITS_PER_UNIT);
3398 /* Otherwise put base + constant offset into the loop invariant BASE
3399 and continue with OFF. */
3400 else
3402 base = fold_convert (sizetype, base);
3403 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3406 /* OFF at this point may be either a SSA_NAME or some tree expression
3407 from get_inner_reference. Try to peel off loop invariants from it
3408 into BASE as long as possible. */
3409 STRIP_NOPS (off);
3410 while (offtype == NULL_TREE)
3412 enum tree_code code;
3413 tree op0, op1, add = NULL_TREE;
3415 if (TREE_CODE (off) == SSA_NAME)
3417 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3419 if (expr_invariant_in_loop_p (loop, off))
3420 return false;
3422 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3423 break;
3425 op0 = gimple_assign_rhs1 (def_stmt);
3426 code = gimple_assign_rhs_code (def_stmt);
3427 op1 = gimple_assign_rhs2 (def_stmt);
3429 else
3431 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3432 return false;
3433 code = TREE_CODE (off);
3434 extract_ops_from_tree (off, &code, &op0, &op1);
3436 switch (code)
3438 case POINTER_PLUS_EXPR:
3439 case PLUS_EXPR:
3440 if (expr_invariant_in_loop_p (loop, op0))
3442 add = op0;
3443 off = op1;
3444 do_add:
3445 add = fold_convert (sizetype, add);
3446 if (scale != 1)
3447 add = size_binop (MULT_EXPR, add, size_int (scale));
3448 base = size_binop (PLUS_EXPR, base, add);
3449 continue;
3451 if (expr_invariant_in_loop_p (loop, op1))
3453 add = op1;
3454 off = op0;
3455 goto do_add;
3457 break;
3458 case MINUS_EXPR:
3459 if (expr_invariant_in_loop_p (loop, op1))
3461 add = fold_convert (sizetype, op1);
3462 add = size_binop (MINUS_EXPR, size_zero_node, add);
3463 off = op0;
3464 goto do_add;
3466 break;
3467 case MULT_EXPR:
3468 if (scale == 1 && tree_fits_shwi_p (op1))
3470 scale = tree_to_shwi (op1);
3471 off = op0;
3472 continue;
3474 break;
3475 case SSA_NAME:
3476 off = op0;
3477 continue;
3478 CASE_CONVERT:
3479 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3480 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3481 break;
3482 if (TYPE_PRECISION (TREE_TYPE (op0))
3483 == TYPE_PRECISION (TREE_TYPE (off)))
3485 off = op0;
3486 continue;
3488 if (TYPE_PRECISION (TREE_TYPE (op0))
3489 < TYPE_PRECISION (TREE_TYPE (off)))
3491 off = op0;
3492 offtype = TREE_TYPE (off);
3493 STRIP_NOPS (off);
3494 continue;
3496 break;
3497 default:
3498 break;
3500 break;
3503 /* If at the end OFF still isn't a SSA_NAME or isn't
3504 defined in the loop, punt. */
3505 if (TREE_CODE (off) != SSA_NAME
3506 || expr_invariant_in_loop_p (loop, off))
3507 return false;
3509 if (offtype == NULL_TREE)
3510 offtype = TREE_TYPE (off);
3512 if (DR_IS_READ (dr))
3513 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3514 offtype, scale);
3515 else
3516 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3517 offtype, scale);
3519 if (decl == NULL_TREE)
3520 return false;
3522 info->decl = decl;
3523 info->base = base;
3524 info->offset = off;
3525 info->offset_dt = vect_unknown_def_type;
3526 info->offset_vectype = NULL_TREE;
3527 info->scale = scale;
3528 return true;
3531 /* Function vect_analyze_data_refs.
3533 Find all the data references in the loop or basic block.
3535 The general structure of the analysis of data refs in the vectorizer is as
3536 follows:
3537 1- vect_analyze_data_refs(loop/bb): call
3538 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3539 in the loop/bb and their dependences.
3540 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3541 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3542 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3546 bool
3547 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3549 struct loop *loop = NULL;
3550 unsigned int i;
3551 struct data_reference *dr;
3552 tree scalar_type;
3554 if (dump_enabled_p ())
3555 dump_printf_loc (MSG_NOTE, vect_location,
3556 "=== vect_analyze_data_refs ===\n");
3558 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3559 loop = LOOP_VINFO_LOOP (loop_vinfo);
3561 /* Go through the data-refs, check that the analysis succeeded. Update
3562 pointer from stmt_vec_info struct to DR and vectype. */
3564 vec<data_reference_p> datarefs = vinfo->datarefs;
3565 FOR_EACH_VEC_ELT (datarefs, i, dr)
3567 gimple *stmt;
3568 stmt_vec_info stmt_info;
3569 tree base, offset, init;
3570 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3571 bool simd_lane_access = false;
3572 int vf;
3574 again:
3575 if (!dr || !DR_REF (dr))
3577 if (dump_enabled_p ())
3578 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3579 "not vectorized: unhandled data-ref\n");
3580 return false;
3583 stmt = DR_STMT (dr);
3584 stmt_info = vinfo_for_stmt (stmt);
3586 /* Discard clobbers from the dataref vector. We will remove
3587 clobber stmts during vectorization. */
3588 if (gimple_clobber_p (stmt))
3590 free_data_ref (dr);
3591 if (i == datarefs.length () - 1)
3593 datarefs.pop ();
3594 break;
3596 datarefs.ordered_remove (i);
3597 dr = datarefs[i];
3598 goto again;
3601 /* Check that analysis of the data-ref succeeded. */
3602 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3603 || !DR_STEP (dr))
3605 bool maybe_gather
3606 = DR_IS_READ (dr)
3607 && !TREE_THIS_VOLATILE (DR_REF (dr))
3608 && targetm.vectorize.builtin_gather != NULL;
3609 bool maybe_scatter
3610 = DR_IS_WRITE (dr)
3611 && !TREE_THIS_VOLATILE (DR_REF (dr))
3612 && targetm.vectorize.builtin_scatter != NULL;
3613 bool maybe_simd_lane_access
3614 = is_a <loop_vec_info> (vinfo) && loop->simduid;
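/* A SIMD lane access is one of the D.nnnn[_k] array references that
   OMP lowering creates for variables privatized in a simd loop
   (e.g. #pragma omp simd), where _k is the result of an
   IFN_GOMP_SIMD_LANE call on the loop's simduid.  Such refs have a
   zero step and an offset of lane * element-size, which is the shape
   matched below.  */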
3616 /* If target supports vector gather loads or scatter stores, or if
3617 this might be a SIMD lane access, see if they can't be used. */
3618 if (is_a <loop_vec_info> (vinfo)
3619 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3620 && !nested_in_vect_loop_p (loop, stmt))
3622 struct data_reference *newdr
3623 = create_data_ref (NULL, loop_containing_stmt (stmt),
3624 DR_REF (dr), stmt, maybe_scatter ? false : true);
3625 gcc_assert (newdr != NULL && DR_REF (newdr));
3626 if (DR_BASE_ADDRESS (newdr)
3627 && DR_OFFSET (newdr)
3628 && DR_INIT (newdr)
3629 && DR_STEP (newdr)
3630 && integer_zerop (DR_STEP (newdr)))
3632 if (maybe_simd_lane_access)
3634 tree off = DR_OFFSET (newdr);
3635 STRIP_NOPS (off);
3636 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3637 && TREE_CODE (off) == MULT_EXPR
3638 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3640 tree step = TREE_OPERAND (off, 1);
3641 off = TREE_OPERAND (off, 0);
3642 STRIP_NOPS (off);
3643 if (CONVERT_EXPR_P (off)
3644 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3645 0)))
3646 < TYPE_PRECISION (TREE_TYPE (off)))
3647 off = TREE_OPERAND (off, 0);
3648 if (TREE_CODE (off) == SSA_NAME)
3650 gimple *def = SSA_NAME_DEF_STMT (off);
3651 tree reft = TREE_TYPE (DR_REF (newdr));
3652 if (is_gimple_call (def)
3653 && gimple_call_internal_p (def)
3654 && (gimple_call_internal_fn (def)
3655 == IFN_GOMP_SIMD_LANE))
3657 tree arg = gimple_call_arg (def, 0);
3658 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3659 arg = SSA_NAME_VAR (arg);
3660 if (arg == loop->simduid
3661 /* For now. */
3662 && tree_int_cst_equal
3663 (TYPE_SIZE_UNIT (reft),
3664 step))
3666 DR_OFFSET (newdr) = ssize_int (0);
3667 DR_STEP (newdr) = step;
3668 DR_ALIGNED_TO (newdr)
3669 = size_int (BIGGEST_ALIGNMENT);
3670 dr = newdr;
3671 simd_lane_access = true;
3677 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3679 dr = newdr;
3680 if (maybe_gather)
3681 gatherscatter = GATHER;
3682 else
3683 gatherscatter = SCATTER;
3686 if (gatherscatter == SG_NONE && !simd_lane_access)
3687 free_data_ref (newdr);
3690 if (gatherscatter == SG_NONE && !simd_lane_access)
3692 if (dump_enabled_p ())
3694 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3695 "not vectorized: data ref analysis "
3696 "failed ");
3697 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3700 if (is_a <bb_vec_info> (vinfo))
3701 break;
3703 return false;
3707 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3709 if (dump_enabled_p ())
3710 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3711 "not vectorized: base addr of dr is a "
3712 "constant\n");
3714 if (is_a <bb_vec_info> (vinfo))
3715 break;
3717 if (gatherscatter != SG_NONE || simd_lane_access)
3718 free_data_ref (dr);
3719 return false;
3722 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3724 if (dump_enabled_p ())
3726 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3727 "not vectorized: volatile type ");
3728 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3731 if (is_a <bb_vec_info> (vinfo))
3732 break;
3734 return false;
3737 if (stmt_can_throw_internal (stmt))
3739 if (dump_enabled_p ())
3741 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3742 "not vectorized: statement can throw an "
3743 "exception ");
3744 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3747 if (is_a <bb_vec_info> (vinfo))
3748 break;
3750 if (gatherscatter != SG_NONE || simd_lane_access)
3751 free_data_ref (dr);
3752 return false;
3755 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3756 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3758 if (dump_enabled_p ())
3760 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3761 "not vectorized: statement is bitfield "
3762 "access ");
3763 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3766 if (is_a <bb_vec_info> (vinfo))
3767 break;
3769 if (gatherscatter != SG_NONE || simd_lane_access)
3770 free_data_ref (dr);
3771 return false;
3774 base = unshare_expr (DR_BASE_ADDRESS (dr));
3775 offset = unshare_expr (DR_OFFSET (dr));
3776 init = unshare_expr (DR_INIT (dr));
3778 if (is_gimple_call (stmt)
3779 && (!gimple_call_internal_p (stmt)
3780 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3781 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3783 if (dump_enabled_p ())
3785 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3786 "not vectorized: dr in a call ");
3787 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3790 if (is_a <bb_vec_info> (vinfo))
3791 break;
3793 if (gatherscatter != SG_NONE || simd_lane_access)
3794 free_data_ref (dr);
3795 return false;
3798 /* Update DR field in stmt_vec_info struct. */
3800 /* If the dataref is in an inner-loop of the loop that is considered
3801 for vectorization, we also want to analyze the access relative to
3802 the outer-loop (DR contains information only relative to the
3803 inner-most enclosing loop). We do that by building a reference to the
3804 first location accessed by the inner-loop, and analyze it relative to
3805 the outer-loop. */
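/* E.g. (reusing the in[i+j] example from the comment on
   vect_create_addr_base_for_vector_ref below) when the i-loop of

     for (i = 0; i < N; i++)
       for (j = 0; j < M; j++)
         s += in[i+j];

   is the one being vectorized, the DR describes in[i+j] relative to the
   j-loop only; the code below additionally records that, relative to
   the i-loop, the access starts at &in[i] and its starting address
   advances by one element per outer iteration.  */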
3806 if (loop && nested_in_vect_loop_p (loop, stmt))
3808 tree outer_step, outer_base, outer_init;
3809 HOST_WIDE_INT pbitsize, pbitpos;
3810 tree poffset;
3811 machine_mode pmode;
3812 int punsignedp, preversep, pvolatilep;
3813 affine_iv base_iv, offset_iv;
3814 tree dinit;
3816 /* Build a reference to the first location accessed by the
3817 inner-loop: *(BASE+INIT). (The first location is actually
3818 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3819 tree inner_base = build_fold_indirect_ref
3820 (fold_build_pointer_plus (base, init));
3822 if (dump_enabled_p ())
3824 dump_printf_loc (MSG_NOTE, vect_location,
3825 "analyze in outer-loop: ");
3826 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3827 dump_printf (MSG_NOTE, "\n");
3830 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3831 &poffset, &pmode, &punsignedp,
3832 &preversep, &pvolatilep);
3833 gcc_assert (outer_base != NULL_TREE);
3835 if (pbitpos % BITS_PER_UNIT != 0)
3837 if (dump_enabled_p ())
3838 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3839 "failed: bit offset alignment.\n");
3840 return false;
3843 if (preversep)
3845 if (dump_enabled_p ())
3846 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3847 "failed: reverse storage order.\n");
3848 return false;
3851 outer_base = build_fold_addr_expr (outer_base);
3852 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3853 &base_iv, false))
3855 if (dump_enabled_p ())
3856 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3857 "failed: evolution of base is not affine.\n");
3858 return false;
3861 if (offset)
3863 if (poffset)
3864 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3865 poffset);
3866 else
3867 poffset = offset;
3870 if (!poffset)
3872 offset_iv.base = ssize_int (0);
3873 offset_iv.step = ssize_int (0);
3875 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3876 &offset_iv, false))
3878 if (dump_enabled_p ())
3879 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3880 "evolution of offset is not affine.\n");
3881 return false;
3884 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3885 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3886 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3887 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3888 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3890 outer_step = size_binop (PLUS_EXPR,
3891 fold_convert (ssizetype, base_iv.step),
3892 fold_convert (ssizetype, offset_iv.step));
3894 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3895 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3896 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3897 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3898 STMT_VINFO_DR_OFFSET (stmt_info) =
3899 fold_convert (ssizetype, offset_iv.base);
3900 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3901 size_int (highest_pow2_factor (offset_iv.base));
3903 if (dump_enabled_p ())
3905 dump_printf_loc (MSG_NOTE, vect_location,
3906 "\touter base_address: ");
3907 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3908 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3909 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3910 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3911 STMT_VINFO_DR_OFFSET (stmt_info));
3912 dump_printf (MSG_NOTE,
3913 "\n\touter constant offset from base address: ");
3914 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3915 STMT_VINFO_DR_INIT (stmt_info));
3916 dump_printf (MSG_NOTE, "\n\touter step: ");
3917 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3918 STMT_VINFO_DR_STEP (stmt_info));
3919 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3920 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3921 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3922 dump_printf (MSG_NOTE, "\n");
3926 if (STMT_VINFO_DATA_REF (stmt_info))
3928 if (dump_enabled_p ())
3930 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3931 "not vectorized: more than one data ref "
3932 "in stmt: ");
3933 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3936 if (is_a <bb_vec_info> (vinfo))
3937 break;
3939 if (gatherscatter != SG_NONE || simd_lane_access)
3940 free_data_ref (dr);
3941 return false;
3944 STMT_VINFO_DATA_REF (stmt_info) = dr;
3945 if (simd_lane_access)
3947 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3948 free_data_ref (datarefs[i]);
3949 datarefs[i] = dr;
3952 /* Set vectype for STMT. */
3953 scalar_type = TREE_TYPE (DR_REF (dr));
3954 STMT_VINFO_VECTYPE (stmt_info)
3955 = get_vectype_for_scalar_type (scalar_type);
3956 if (!STMT_VINFO_VECTYPE (stmt_info))
3958 if (dump_enabled_p ())
3960 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3961 "not vectorized: no vectype for stmt: ");
3962 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3963 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3964 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3965 scalar_type);
3966 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3969 if (is_a <bb_vec_info> (vinfo))
3971 /* No vector type is fine, the ref can still participate
3972 in dependence analysis, we just can't vectorize it. */
3973 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3974 continue;
3977 if (gatherscatter != SG_NONE || simd_lane_access)
3979 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3980 if (gatherscatter != SG_NONE)
3981 free_data_ref (dr);
3983 return false;
3985 else
3987 if (dump_enabled_p ())
3989 dump_printf_loc (MSG_NOTE, vect_location,
3990 "got vectype for stmt: ");
3991 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3992 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3993 STMT_VINFO_VECTYPE (stmt_info));
3994 dump_printf (MSG_NOTE, "\n");
3998 /* Adjust the minimal vectorization factor according to the
3999 vector type. */
4000 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4001 if (vf > *min_vf)
4002 *min_vf = vf;
4004 if (gatherscatter != SG_NONE)
4006 gather_scatter_info gs_info;
4007 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4008 &gs_info)
4009 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4011 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4012 free_data_ref (dr);
4013 if (dump_enabled_p ())
4015 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4016 (gatherscatter == GATHER) ?
4017 "not vectorized: not suitable for gather "
4018 "load " :
4019 "not vectorized: not suitable for scatter "
4020 "store ");
4021 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4023 return false;
4026 free_data_ref (datarefs[i]);
4027 datarefs[i] = dr;
4028 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4031 else if (is_a <loop_vec_info> (vinfo)
4032 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4034 if (nested_in_vect_loop_p (loop, stmt))
4036 if (dump_enabled_p ())
4038 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4039 "not vectorized: not suitable for strided "
4040 "load ");
4041 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4043 return false;
4045 STMT_VINFO_STRIDED_P (stmt_info) = true;
4049 /* If we stopped analysis at the first dataref we could not analyze
4050 when trying to vectorize a basic-block mark the rest of the datarefs
4051 as not vectorizable and truncate the vector of datarefs. That
4052 avoids spending useless time in analyzing their dependence. */
4053 if (i != datarefs.length ())
4055 gcc_assert (is_a <bb_vec_info> (vinfo));
4056 for (unsigned j = i; j < datarefs.length (); ++j)
4058 data_reference_p dr = datarefs[j];
4059 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
4060 free_data_ref (dr);
4062 datarefs.truncate (i);
4065 return true;
4069 /* Function vect_get_new_vect_var.
4071 Returns a name for a new variable. The current naming scheme prepends
4072 a prefix chosen by VAR_KIND ("vect", "vectp", "stmp" or "mask") to
4073 the names of vectorizer-generated variables; the prefix is combined
4074 with NAME when NAME is provided and used on its own otherwise. */
4076 tree
4077 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4079 const char *prefix;
4080 tree new_vect_var;
4082 switch (var_kind)
4084 case vect_simple_var:
4085 prefix = "vect";
4086 break;
4087 case vect_scalar_var:
4088 prefix = "stmp";
4089 break;
4090 case vect_mask_var:
4091 prefix = "mask";
4092 break;
4093 case vect_pointer_var:
4094 prefix = "vectp";
4095 break;
4096 default:
4097 gcc_unreachable ();
4100 if (name)
4102 char* tmp = concat (prefix, "_", name, NULL);
4103 new_vect_var = create_tmp_reg (type, tmp);
4104 free (tmp);
4106 else
4107 new_vect_var = create_tmp_reg (type, prefix);
4109 return new_vect_var;
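/* For example, vect_get_new_vect_var (ptr_type, vect_pointer_var, "in")
   declares a temporary named "vectp_in", while a NULL NAME yields a
   temporary named just "vectp".  */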
4112 /* Like vect_get_new_vect_var but return an SSA name. */
4114 tree
4115 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4117 const char *prefix;
4118 tree new_vect_var;
4120 switch (var_kind)
4122 case vect_simple_var:
4123 prefix = "vect";
4124 break;
4125 case vect_scalar_var:
4126 prefix = "stmp";
4127 break;
4128 case vect_pointer_var:
4129 prefix = "vectp";
4130 break;
4131 default:
4132 gcc_unreachable ();
4135 if (name)
4137 char* tmp = concat (prefix, "_", name, NULL);
4138 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4139 free (tmp);
4141 else
4142 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4144 return new_vect_var;
4147 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4149 static void
4150 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
4151 stmt_vec_info stmt_info)
4153 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4154 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
4155 int misalign = DR_MISALIGNMENT (dr);
4156 if (misalign == -1)
4157 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4158 else
4159 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
4162 /* Function vect_create_addr_base_for_vector_ref.
4164 Create an expression that computes the address of the first memory location
4165 that will be accessed for a data reference.
4167 Input:
4168 STMT: The statement containing the data reference.
4169 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4170 OFFSET: Optional. If supplied, it is added to the initial address.
4171 LOOP: Specify relative to which loop-nest the address should be computed.
4172 For example, when the dataref is in an inner-loop nested in an
4173 outer-loop that is now being vectorized, LOOP can be either the
4174 outer-loop, or the inner-loop. The first memory location accessed
4175 by the following dataref ('in' points to short):
4177 for (i=0; i<N; i++)
4178 for (j=0; j<M; j++)
4179 s += in[i+j]
4181 is as follows:
4182 if LOOP=i_loop: &in (relative to i_loop)
4183 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4184 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4185 initial address. Unlike OFFSET, which is number of elements to
4186 be added, BYTE_OFFSET is measured in bytes.
4188 Output:
4189 1. Return an SSA_NAME whose value is the address of the memory location of
4190 the first vector of the data reference.
4191 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4192 these statement(s) which define the returned SSA_NAME.
4194 FORNOW: We are only handling array accesses with step 1. */
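/* Illustration (hypothetical SSA names): for the in[i+j] example above
   with LOOP = j_loop and no extra OFFSET/BYTE_OFFSET, the statements
   appended to NEW_STMT_LIST compute roughly

     vectp_in.7_24 = in_5 + ((sizetype) i_12 * 2 + 0);

   i.e. the base address plus (DR_OFFSET converted to sizetype plus
   DR_INIT), and that SSA_NAME is what gets returned.  */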
4196 tree
4197 vect_create_addr_base_for_vector_ref (gimple *stmt,
4198 gimple_seq *new_stmt_list,
4199 tree offset,
4200 struct loop *loop,
4201 tree byte_offset)
4203 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4204 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4205 tree data_ref_base;
4206 const char *base_name;
4207 tree addr_base;
4208 tree dest;
4209 gimple_seq seq = NULL;
4210 tree base_offset;
4211 tree init;
4212 tree vect_ptr_type;
4213 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4214 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4216 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
4218 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
4220 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
4222 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4223 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
4224 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
4226 else
4228 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
4229 base_offset = unshare_expr (DR_OFFSET (dr));
4230 init = unshare_expr (DR_INIT (dr));
4233 if (loop_vinfo)
4234 base_name = get_name (data_ref_base);
4235 else
4237 base_offset = ssize_int (0);
4238 init = ssize_int (0);
4239 base_name = get_name (DR_REF (dr));
4242 /* Create base_offset */
4243 base_offset = size_binop (PLUS_EXPR,
4244 fold_convert (sizetype, base_offset),
4245 fold_convert (sizetype, init));
4247 if (offset)
4249 offset = fold_build2 (MULT_EXPR, sizetype,
4250 fold_convert (sizetype, offset), step);
4251 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4252 base_offset, offset);
4254 if (byte_offset)
4256 byte_offset = fold_convert (sizetype, byte_offset);
4257 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4258 base_offset, byte_offset);
4261 /* base + base_offset */
4262 if (loop_vinfo)
4263 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4264 else
4266 addr_base = build1 (ADDR_EXPR,
4267 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4268 unshare_expr (DR_REF (dr)));
4271 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4272 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4273 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4274 gimple_seq_add_seq (new_stmt_list, seq);
4276 if (DR_PTR_INFO (dr)
4277 && TREE_CODE (addr_base) == SSA_NAME
4278 && !SSA_NAME_PTR_INFO (addr_base))
4280 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4281 if (offset || byte_offset)
4282 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4285 if (dump_enabled_p ())
4287 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4288 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4289 dump_printf (MSG_NOTE, "\n");
4292 return addr_base;
4296 /* Function vect_create_data_ref_ptr.
4298 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4299 location accessed in the loop by STMT, along with the def-use update
4300 chain to appropriately advance the pointer through the loop iterations.
4301 Also set aliasing information for the pointer. This pointer is used by
4302 the callers to this function to create a memory reference expression for
4303 vector load/store access.
4305 Input:
4306 1. STMT: a stmt that references memory. Expected to be of the form
4307 GIMPLE_ASSIGN <name, data-ref> or
4308 GIMPLE_ASSIGN <data-ref, name>.
4309 2. AGGR_TYPE: the type of the reference, which should be either a vector
4310 or an array.
4311 3. AT_LOOP: the loop where the vector memref is to be created.
4312 4. OFFSET (optional): an offset to be added to the initial address accessed
4313 by the data-ref in STMT.
4314 5. BSI: location where the new stmts are to be placed if there is no loop
4315 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4316 pointing to the initial address.
4317 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4318 to the initial address accessed by the data-ref in STMT. This is
4319 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4320 in bytes.
4322 Output:
4323 1. Declare a new ptr to vector_type, and have it point to the base of the
4324 data reference (initial address accessed by the data reference).
4325 For example, for vector of type V8HI, the following code is generated:
4327 v8hi *ap;
4328 ap = (v8hi *)initial_address;
4330 if OFFSET is not supplied:
4331 initial_address = &a[init];
4332 if OFFSET is supplied:
4333 initial_address = &a[init + OFFSET];
4334 if BYTE_OFFSET is supplied:
4335 initial_address = &a[init] + BYTE_OFFSET;
4337 Return the initial_address in INITIAL_ADDRESS.
4339 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4340 update the pointer in each iteration of the loop.
4342 Return the increment stmt that updates the pointer in PTR_INCR.
4344 3. Set INV_P to true if the access pattern of the data reference in the
4345 vectorized loop is invariant. Set it to false otherwise.
4347 4. Return the pointer. */
4349 tree
4350 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4351 tree offset, tree *initial_address,
4352 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4353 bool only_init, bool *inv_p, tree byte_offset)
4355 const char *base_name;
4356 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4357 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4358 struct loop *loop = NULL;
4359 bool nested_in_vect_loop = false;
4360 struct loop *containing_loop = NULL;
4361 tree aggr_ptr_type;
4362 tree aggr_ptr;
4363 tree new_temp;
4364 gimple_seq new_stmt_list = NULL;
4365 edge pe = NULL;
4366 basic_block new_bb;
4367 tree aggr_ptr_init;
4368 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4369 tree aptr;
4370 gimple_stmt_iterator incr_gsi;
4371 bool insert_after;
4372 tree indx_before_incr, indx_after_incr;
4373 gimple *incr;
4374 tree step;
4375 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4377 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4378 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4380 if (loop_vinfo)
4382 loop = LOOP_VINFO_LOOP (loop_vinfo);
4383 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4384 containing_loop = (gimple_bb (stmt))->loop_father;
4385 pe = loop_preheader_edge (loop);
4387 else
4389 gcc_assert (bb_vinfo);
4390 only_init = true;
4391 *ptr_incr = NULL;
4394 /* Check the step (evolution) of the load in LOOP, and record
4395 whether it's invariant. */
4396 if (nested_in_vect_loop)
4397 step = STMT_VINFO_DR_STEP (stmt_info);
4398 else
4399 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4401 if (integer_zerop (step))
4402 *inv_p = true;
4403 else
4404 *inv_p = false;
4406 /* Create an expression for the first address accessed by this load
4407 in LOOP. */
4408 base_name = get_name (DR_BASE_ADDRESS (dr));
4410 if (dump_enabled_p ())
4412 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4413 dump_printf_loc (MSG_NOTE, vect_location,
4414 "create %s-pointer variable to type: ",
4415 get_tree_code_name (TREE_CODE (aggr_type)));
4416 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4417 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4418 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4419 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4420 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4421 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4422 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4423 else
4424 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4425 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4426 dump_printf (MSG_NOTE, "\n");
4429 /* (1) Create the new aggregate-pointer variable.
4430 Vector and array types inherit the alias set of their component
4431 type by default so we need to use a ref-all pointer if the data
4432 reference does not conflict with the created aggregated data
4433 reference because it is not addressable. */
4434 bool need_ref_all = false;
4435 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4436 get_alias_set (DR_REF (dr))))
4437 need_ref_all = true;
4438 /* Likewise for any of the data references in the stmt group. */
4439 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4441 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4444 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4445 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4446 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4447 get_alias_set (DR_REF (sdr))))
4449 need_ref_all = true;
4450 break;
4452 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4454 while (orig_stmt);
4456 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4457 need_ref_all);
4458 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4461 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4462 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4463 def-use update cycles for the pointer: one relative to the outer-loop
4464 (LOOP), which is what steps (3) and (4) below do. The other is relative
4465 to the inner-loop (which is the inner-most loop containing the dataref),
4466 and this is done by step (5) below.
4468 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4469 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4470 redundant. Steps (3),(4) create the following:
4472 vp0 = &base_addr;
4473 LOOP: vp1 = phi(vp0,vp2)
4476 vp2 = vp1 + step
4477 goto LOOP
4479 If there is an inner-loop nested in loop, then step (5) will also be
4480 applied, and an additional update in the inner-loop will be created:
4482 vp0 = &base_addr;
4483 LOOP: vp1 = phi(vp0,vp2)
4485 inner: vp3 = phi(vp1,vp4)
4486 vp4 = vp3 + inner_step
4487 if () goto inner
4489 vp2 = vp1 + step
4490 if () goto LOOP */
4492 /* (2) Calculate the initial address of the aggregate-pointer, and set
4493 the aggregate-pointer to point to it before the loop. */
4495 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4497 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4498 offset, loop, byte_offset);
4499 if (new_stmt_list)
4501 if (pe)
4503 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4504 gcc_assert (!new_bb);
4506 else
4507 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4510 *initial_address = new_temp;
4511 aggr_ptr_init = new_temp;
4513 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4514 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4515 inner-loop nested in LOOP (during outer-loop vectorization). */
4517 /* No update in loop is required. */
4518 if (only_init && (!loop_vinfo || at_loop == loop))
4519 aptr = aggr_ptr_init;
4520 else
4522 /* The step of the aggregate pointer is the type size. */
4523 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4524 /* One exception to the above is when the scalar step of the load in
4525 LOOP is zero. In this case the step here is also zero. */
4526 if (*inv_p)
4527 iv_step = size_zero_node;
4528 else if (tree_int_cst_sgn (step) == -1)
4529 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4531 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4533 create_iv (aggr_ptr_init,
4534 fold_convert (aggr_ptr_type, iv_step),
4535 aggr_ptr, loop, &incr_gsi, insert_after,
4536 &indx_before_incr, &indx_after_incr);
4537 incr = gsi_stmt (incr_gsi);
4538 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4540 /* Copy the points-to information if it exists. */
4541 if (DR_PTR_INFO (dr))
4543 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4544 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4546 if (ptr_incr)
4547 *ptr_incr = incr;
4549 aptr = indx_before_incr;
4552 if (!nested_in_vect_loop || only_init)
4553 return aptr;
4556 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4557 nested in LOOP, if exists. */
4559 gcc_assert (nested_in_vect_loop);
4560 if (!only_init)
4562 standard_iv_increment_position (containing_loop, &incr_gsi,
4563 &insert_after);
4564 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4565 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4566 &indx_after_incr);
4567 incr = gsi_stmt (incr_gsi);
4568 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4570 /* Copy the points-to information if it exists. */
4571 if (DR_PTR_INFO (dr))
4573 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4574 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4576 if (ptr_incr)
4577 *ptr_incr = incr;
4579 return indx_before_incr;
4581 else
4582 gcc_unreachable ();
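/* Simplified usage sketch (hypothetical local names; the actual callers
   live in tree-vect-stmts.c):

     dataref_ptr = vect_create_data_ref_ptr (stmt, vectype, loop, offset,
                                             &dummy, gsi, &ptr_incr,
                                             false, &inv_p, NULL_TREE);
     data_ref = build2 (MEM_REF, vectype, dataref_ptr,
                        build_int_cst (reference_alias_ptr_type
                                         (DR_REF (dr)), 0));

   i.e. the returned pointer is dereferenced through a MEM_REF to form
   the vector load or store, and PTR_INCR is later passed to
   bump_vector_ptr to advance the pointer between vector copies.  */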
4586 /* Function bump_vector_ptr
4588 Increment a pointer (to a vector type) by vector-size. If requested,
4589 i.e. if PTR-INCR is given, then also connect the new increment stmt
4590 to the existing def-use update-chain of the pointer, by modifying
4591 the PTR_INCR as illustrated below:
4593 The pointer def-use update-chain before this function:
4594 DATAREF_PTR = phi (p_0, p_2)
4595 ....
4596 PTR_INCR: p_2 = DATAREF_PTR + step
4598 The pointer def-use update-chain after this function:
4599 DATAREF_PTR = phi (p_0, p_2)
4600 ....
4601 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4602 ....
4603 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4605 Input:
4606 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4607 in the loop.
4608 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4609 the loop. The increment amount across iterations is expected
4610 to be vector_size.
4611 BSI - location where the new update stmt is to be placed.
4612 STMT - the original scalar memory-access stmt that is being vectorized.
4613 BUMP - optional. The offset by which to bump the pointer. If not given,
4614 the offset is assumed to be vector_size.
4616 Output: Return NEW_DATAREF_PTR as illustrated above.
4620 tree
4621 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4622 gimple *stmt, tree bump)
4624 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4625 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4626 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4627 tree update = TYPE_SIZE_UNIT (vectype);
4628 gassign *incr_stmt;
4629 ssa_op_iter iter;
4630 use_operand_p use_p;
4631 tree new_dataref_ptr;
4633 if (bump)
4634 update = bump;
4636 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4637 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4638 else
4639 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4640 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4641 dataref_ptr, update);
4642 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4644 /* Copy the points-to information if it exists. */
4645 if (DR_PTR_INFO (dr))
4647 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4648 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4651 if (!ptr_incr)
4652 return new_dataref_ptr;
4654 /* Update the vector-pointer's cross-iteration increment. */
4655 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4657 tree use = USE_FROM_PTR (use_p);
4659 if (use == dataref_ptr)
4660 SET_USE (use_p, new_dataref_ptr);
4661 else
4662 gcc_assert (tree_int_cst_compare (use, update) == 0);
4665 return new_dataref_ptr;
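/* E.g. when a single scalar access expands to several vector copies per
   iteration, the expander (sketch; the actual callers are in
   tree-vect-stmts.c) calls

     dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
                                    NULL_TREE);

   between copies so each copy accesses the next vector-sized chunk,
   while PTR_INCR keeps advancing the pointer across loop iterations.  */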
4669 /* Function vect_create_destination_var.
4671 Create a new temporary of type VECTYPE. */
4673 tree
4674 vect_create_destination_var (tree scalar_dest, tree vectype)
4676 tree vec_dest;
4677 const char *name;
4678 char *new_name;
4679 tree type;
4680 enum vect_var_kind kind;
4682 kind = vectype
4683 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4684 ? vect_mask_var
4685 : vect_simple_var
4686 : vect_scalar_var;
4687 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4689 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4691 name = get_name (scalar_dest);
4692 if (name)
4693 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4694 else
4695 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4696 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4697 free (new_name);
4699 return vec_dest;
4702 /* Function vect_grouped_store_supported.
4704 Returns TRUE if interleave high and interleave low permutations
4705 are supported, and FALSE otherwise. */
4707 bool
4708 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4710 machine_mode mode = TYPE_MODE (vectype);
4712 /* vect_permute_store_chain requires the group size to be equal to 3 or
4713 be a power of two. */
4714 if (count != 3 && exact_log2 (count) == -1)
4716 if (dump_enabled_p ())
4717 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4718 "the size of the group of accesses"
4719 " is not a power of 2 or not eqaul to 3\n");
4720 return false;
4723 /* Check that the permutation is supported. */
4724 if (VECTOR_MODE_P (mode))
4726 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4727 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4729 if (count == 3)
4731 unsigned int j0 = 0, j1 = 0, j2 = 0;
4732 unsigned int i, j;
4734 for (j = 0; j < 3; j++)
4736 int nelt0 = ((3 - j) * nelt) % 3;
4737 int nelt1 = ((3 - j) * nelt + 1) % 3;
4738 int nelt2 = ((3 - j) * nelt + 2) % 3;
4739 for (i = 0; i < nelt; i++)
4741 if (3 * i + nelt0 < nelt)
4742 sel[3 * i + nelt0] = j0++;
4743 if (3 * i + nelt1 < nelt)
4744 sel[3 * i + nelt1] = nelt + j1++;
4745 if (3 * i + nelt2 < nelt)
4746 sel[3 * i + nelt2] = 0;
4748 if (!can_vec_perm_p (mode, false, sel))
4750 if (dump_enabled_p ())
4751 dump_printf (MSG_MISSED_OPTIMIZATION,
4752 "permutaion op not supported by target.\n");
4753 return false;
4756 for (i = 0; i < nelt; i++)
4758 if (3 * i + nelt0 < nelt)
4759 sel[3 * i + nelt0] = 3 * i + nelt0;
4760 if (3 * i + nelt1 < nelt)
4761 sel[3 * i + nelt1] = 3 * i + nelt1;
4762 if (3 * i + nelt2 < nelt)
4763 sel[3 * i + nelt2] = nelt + j2++;
4765 if (!can_vec_perm_p (mode, false, sel))
4767 if (dump_enabled_p ())
4768 dump_printf (MSG_MISSED_OPTIMIZATION,
4769 "permutaion op not supported by target.\n");
4770 return false;
4773 return true;
4775 else
4777 /* If length is not equal to 3 then only power of 2 is supported. */
4778 gcc_assert (pow2p_hwi (count));
4780 for (i = 0; i < nelt / 2; i++)
4782 sel[i * 2] = i;
4783 sel[i * 2 + 1] = i + nelt;
4785 if (can_vec_perm_p (mode, false, sel))
4787 for (i = 0; i < nelt; i++)
4788 sel[i] += nelt / 2;
4789 if (can_vec_perm_p (mode, false, sel))
4790 return true;
4795 if (dump_enabled_p ())
4796 dump_printf (MSG_MISSED_OPTIMIZATION,
4797 "permutaion op not supported by target.\n");
4798 return false;
4802 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4803 type VECTYPE. */
4805 bool
4806 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4808 return vect_lanes_optab_supported_p ("vec_store_lanes",
4809 vec_store_lanes_optab,
4810 vectype, count);
4814 /* Function vect_permute_store_chain.
4816 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4817 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4818 the data correctly for the stores. Return the final references for stores
4819 in RESULT_CHAIN.
4821 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4822 The input is 4 vectors each containing 8 elements. We assign a number to
4823 each element, the input sequence is:
4825 1st vec: 0 1 2 3 4 5 6 7
4826 2nd vec: 8 9 10 11 12 13 14 15
4827 3rd vec: 16 17 18 19 20 21 22 23
4828 4th vec: 24 25 26 27 28 29 30 31
4830 The output sequence should be:
4832 1st vec: 0 8 16 24 1 9 17 25
4833 2nd vec: 2 10 18 26 3 11 19 27
4834 3rd vec: 4 12 20 28 5 13 21 29
4835 4th vec: 6 14 22 30 7 15 23 31
4837 i.e., we interleave the contents of the four vectors in their order.
4839 We use interleave_high/low instructions to create such output. The input of
4840 each interleave_high/low operation is two vectors:
4841 1st vec 2nd vec
4842 0 1 2 3 4 5 6 7
4843 the even elements of the result vector are obtained left-to-right from the
4844 high/low elements of the first vector. The odd elements of the result are
4845 obtained left-to-right from the high/low elements of the second vector.
4846 The output of interleave_high will be: 0 4 1 5
4847 and of interleave_low: 2 6 3 7
4850 The permutation is done in log LENGTH stages. In each stage interleave_high
4851 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4852 where the first argument is taken from the first half of DR_CHAIN and the
4854 second argument from its second half.
4854 In our example,
4856 I1: interleave_high (1st vec, 3rd vec)
4857 I2: interleave_low (1st vec, 3rd vec)
4858 I3: interleave_high (2nd vec, 4th vec)
4859 I4: interleave_low (2nd vec, 4th vec)
4861 The output for the first stage is:
4863 I1: 0 16 1 17 2 18 3 19
4864 I2: 4 20 5 21 6 22 7 23
4865 I3: 8 24 9 25 10 26 11 27
4866 I4: 12 28 13 29 14 30 15 31
4868 The output of the second stage, i.e. the final result is:
4870 I1: 0 8 16 24 1 9 17 25
4871 I2: 2 10 18 26 3 11 19 27
4872 I3: 4 12 20 28 5 13 21 29
4873 I4: 6 14 22 30 7 15 23 31. */
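/* In selector terms: for the nelt == 4 example above the code below
   builds the interleave-high mask {0, 4, 1, 5} and the interleave-low
   mask {2, 6, 3, 7}, which produce exactly the 0 4 1 5 and 2 6 3 7
   outputs shown; for length == 3 the two three-vector shuffle masks
   built inside the function are used instead.  */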
4875 void
4876 vect_permute_store_chain (vec<tree> dr_chain,
4877 unsigned int length,
4878 gimple *stmt,
4879 gimple_stmt_iterator *gsi,
4880 vec<tree> *result_chain)
4882 tree vect1, vect2, high, low;
4883 gimple *perm_stmt;
4884 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4885 tree perm_mask_low, perm_mask_high;
4886 tree data_ref;
4887 tree perm3_mask_low, perm3_mask_high;
4888 unsigned int i, n, log_length = exact_log2 (length);
4889 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4890 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4892 result_chain->quick_grow (length);
4893 memcpy (result_chain->address (), dr_chain.address (),
4894 length * sizeof (tree));
4896 if (length == 3)
4898 unsigned int j0 = 0, j1 = 0, j2 = 0;
4900 for (j = 0; j < 3; j++)
4902 int nelt0 = ((3 - j) * nelt) % 3;
4903 int nelt1 = ((3 - j) * nelt + 1) % 3;
4904 int nelt2 = ((3 - j) * nelt + 2) % 3;
4906 for (i = 0; i < nelt; i++)
4908 if (3 * i + nelt0 < nelt)
4909 sel[3 * i + nelt0] = j0++;
4910 if (3 * i + nelt1 < nelt)
4911 sel[3 * i + nelt1] = nelt + j1++;
4912 if (3 * i + nelt2 < nelt)
4913 sel[3 * i + nelt2] = 0;
4915 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4917 for (i = 0; i < nelt; i++)
4919 if (3 * i + nelt0 < nelt)
4920 sel[3 * i + nelt0] = 3 * i + nelt0;
4921 if (3 * i + nelt1 < nelt)
4922 sel[3 * i + nelt1] = 3 * i + nelt1;
4923 if (3 * i + nelt2 < nelt)
4924 sel[3 * i + nelt2] = nelt + j2++;
4926 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4928 vect1 = dr_chain[0];
4929 vect2 = dr_chain[1];
4931 /* Create interleaving stmt:
4932 low = VEC_PERM_EXPR <vect1, vect2,
4933 {j, nelt, *, j + 1, nelt + j + 1, *,
4934 j + 2, nelt + j + 2, *, ...}> */
4935 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4936 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4937 vect2, perm3_mask_low);
4938 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4940 vect1 = data_ref;
4941 vect2 = dr_chain[2];
4942 /* Create interleaving stmt:
4943 low = VEC_PERM_EXPR <vect1, vect2,
4944 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4945 6, 7, nelt + j + 2, ...}> */
4946 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4947 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4948 vect2, perm3_mask_high);
4949 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4950 (*result_chain)[j] = data_ref;
4953 else
4955 /* If length is not equal to 3 then only power of 2 is supported. */
4956 gcc_assert (pow2p_hwi (length));
4958 for (i = 0, n = nelt / 2; i < n; i++)
4960 sel[i * 2] = i;
4961 sel[i * 2 + 1] = i + nelt;
4963 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4965 for (i = 0; i < nelt; i++)
4966 sel[i] += nelt / 2;
4967 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4969 for (i = 0, n = log_length; i < n; i++)
4971 for (j = 0; j < length/2; j++)
4973 vect1 = dr_chain[j];
4974 vect2 = dr_chain[j+length/2];
4976 /* Create interleaving stmt:
4977 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4978 ...}> */
4979 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4980 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4981 vect2, perm_mask_high);
4982 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4983 (*result_chain)[2*j] = high;
4985 /* Create interleaving stmt:
4986 low = VEC_PERM_EXPR <vect1, vect2,
4987 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4988 ...}> */
4989 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4990 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4991 vect2, perm_mask_low);
4992 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4993 (*result_chain)[2*j+1] = low;
4995 memcpy (dr_chain.address (), result_chain->address (),
4996 length * sizeof (tree));
5001 /* Function vect_setup_realignment
5003 This function is called when vectorizing an unaligned load using
5004 the dr_explicit_realign[_optimized] scheme.
5005 This function generates the following code at the loop prolog:
5007 p = initial_addr;
5008 x msq_init = *(floor(p)); # prolog load
5009 realignment_token = call target_builtin;
5010 loop:
5011 x msq = phi (msq_init, ---)
5013 The stmts marked with x are generated only for the case of
5014 dr_explicit_realign_optimized.
5016 The code above sets up a new (vector) pointer, pointing to the first
5017 location accessed by STMT, and a "floor-aligned" load using that pointer.
5018 It also generates code to compute the "realignment-token" (if the relevant
5019 target hook was defined), and creates a phi-node at the loop-header bb
5020 whose arguments are the result of the prolog-load (created by this
5021 function) and the result of a load that takes place in the loop (to be
5022 created by the caller to this function).
5024 For the case of dr_explicit_realign_optimized:
5025 The caller to this function uses the phi-result (msq) to create the
5026 realignment code inside the loop, and sets up the missing phi argument,
5027 as follows:
5028 loop:
5029 msq = phi (msq_init, lsq)
5030 lsq = *(floor(p')); # load in loop
5031 result = realign_load (msq, lsq, realignment_token);
5033 For the case of dr_explicit_realign:
5034 loop:
5035 msq = *(floor(p)); # load in loop
5036 p' = p + (VS-1);
5037 lsq = *(floor(p')); # load in loop
5038 result = realign_load (msq, lsq, realignment_token);
5040 Input:
5041 STMT - (scalar) load stmt to be vectorized. This load accesses
5042 a memory location that may be unaligned.
5043 BSI - place where new code is to be inserted.
5044 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5045 is used.
5047 Output:
5048 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5049 target hook, if defined.
5050 Return value - the result of the loop-header phi node. */
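/* Rough shape of the emitted prolog for dr_explicit_realign_optimized
   (hypothetical SSA names, <mask_for_load> standing for whatever the
   target's builtin_mask_for_load hook returns):

     vectp.5_8 = <initial address>;
     _9 = vectp.5_8 & -ALIGN;                 <- floor (p)
     msq_init_10 = MEM[(vectype *)_9];        <- prolog load
     realign_tok_11 = <mask_for_load> (vectp.5_8);

   The phi created in step 5 below then merges msq_init_10 with the
   in-loop load that the caller creates later.  */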
5052 tree
5053 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5054 tree *realignment_token,
5055 enum dr_alignment_support alignment_support_scheme,
5056 tree init_addr,
5057 struct loop **at_loop)
5059 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5060 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5061 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5062 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5063 struct loop *loop = NULL;
5064 edge pe = NULL;
5065 tree scalar_dest = gimple_assign_lhs (stmt);
5066 tree vec_dest;
5067 gimple *inc;
5068 tree ptr;
5069 tree data_ref;
5070 basic_block new_bb;
5071 tree msq_init = NULL_TREE;
5072 tree new_temp;
5073 gphi *phi_stmt;
5074 tree msq = NULL_TREE;
5075 gimple_seq stmts = NULL;
5076 bool inv_p;
5077 bool compute_in_loop = false;
5078 bool nested_in_vect_loop = false;
5079 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5080 struct loop *loop_for_initial_load = NULL;
5082 if (loop_vinfo)
5084 loop = LOOP_VINFO_LOOP (loop_vinfo);
5085 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5088 gcc_assert (alignment_support_scheme == dr_explicit_realign
5089 || alignment_support_scheme == dr_explicit_realign_optimized);
5091 /* We need to generate three things:
5092 1. the misalignment computation
5093 2. the extra vector load (for the optimized realignment scheme).
5094 3. the phi node for the two vectors from which the realignment is
5095 done (for the optimized realignment scheme). */
5097 /* 1. Determine where to generate the misalignment computation.
5099 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5100 calculation will be generated by this function, outside the loop (in the
5101 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5102 caller, inside the loop.
5104 Background: If the misalignment remains fixed throughout the iterations of
5105 the loop, then both realignment schemes are applicable, and also the
5106 misalignment computation can be done outside LOOP. This is because we are
5107 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5108 are a multiple of VS (the Vector Size), and therefore the misalignment in
5109 different vectorized LOOP iterations is always the same.
5110 The problem arises only if the memory access is in an inner-loop nested
5111 inside LOOP, which is now being vectorized using outer-loop vectorization.
5112 This is the only case when the misalignment of the memory access may not
5113 remain fixed throughout the iterations of the inner-loop (as explained in
5114 detail in vect_supportable_dr_alignment). In this case, not only is the
5115 optimized realignment scheme not applicable, but also the misalignment
5116 computation (and generation of the realignment token that is passed to
5117 REALIGN_LOAD) have to be done inside the loop.
5119 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5120 or not, which in turn determines if the misalignment is computed inside
5121 the inner-loop, or outside LOOP. */
5123 if (init_addr != NULL_TREE || !loop_vinfo)
5125 compute_in_loop = true;
5126 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5130 /* 2. Determine where to generate the extra vector load.
5132 For the optimized realignment scheme, instead of generating two vector
5133 loads in each iteration, we generate a single extra vector load in the
5134 preheader of the loop, and in each iteration reuse the result of the
5135 vector load from the previous iteration. In case the memory access is in
5136 an inner-loop nested inside LOOP, which is now being vectorized using
5137 outer-loop vectorization, we need to determine whether this initial vector
5138 load should be generated at the preheader of the inner-loop, or can be
5139 generated at the preheader of LOOP. If the memory access has no evolution
5140 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5141 to be generated inside LOOP (in the preheader of the inner-loop). */
5143 if (nested_in_vect_loop)
5145 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5146 bool invariant_in_outerloop =
5147 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5148 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5150 else
5151 loop_for_initial_load = loop;
5152 if (at_loop)
5153 *at_loop = loop_for_initial_load;
5155 if (loop_for_initial_load)
5156 pe = loop_preheader_edge (loop_for_initial_load);
5158 /* 3. For the case of the optimized realignment, create the first vector
5159 load at the loop preheader. */
5161 if (alignment_support_scheme == dr_explicit_realign_optimized)
5163 /* Create msq_init = *(floor(p1)) in the loop preheader */
5164 gassign *new_stmt;
5166 gcc_assert (!compute_in_loop);
5167 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5168 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5169 NULL_TREE, &init_addr, NULL, &inc,
5170 true, &inv_p);
5171 if (TREE_CODE (ptr) == SSA_NAME)
5172 new_temp = copy_ssa_name (ptr);
5173 else
5174 new_temp = make_ssa_name (TREE_TYPE (ptr));
5175 new_stmt = gimple_build_assign
5176 (new_temp, BIT_AND_EXPR, ptr,
5177 build_int_cst (TREE_TYPE (ptr),
5178 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
5179 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5180 gcc_assert (!new_bb);
5181 data_ref
5182 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5183 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5184 new_stmt = gimple_build_assign (vec_dest, data_ref);
5185 new_temp = make_ssa_name (vec_dest, new_stmt);
5186 gimple_assign_set_lhs (new_stmt, new_temp);
5187 if (pe)
5189 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5190 gcc_assert (!new_bb);
5192 else
5193 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5195 msq_init = gimple_assign_lhs (new_stmt);
5198 /* 4. Create realignment token using a target builtin, if available.
5199 It is done either inside the containing loop, or before LOOP (as
5200 determined above). */
5202 if (targetm.vectorize.builtin_mask_for_load)
5204 gcall *new_stmt;
5205 tree builtin_decl;
5207 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5208 if (!init_addr)
5210 /* Generate the INIT_ADDR computation outside LOOP. */
5211 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5212 NULL_TREE, loop);
5213 if (loop)
5215 pe = loop_preheader_edge (loop);
5216 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5217 gcc_assert (!new_bb);
5219 else
5220 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5223 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5224 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5225 vec_dest =
5226 vect_create_destination_var (scalar_dest,
5227 gimple_call_return_type (new_stmt));
5228 new_temp = make_ssa_name (vec_dest, new_stmt);
5229 gimple_call_set_lhs (new_stmt, new_temp);
5231 if (compute_in_loop)
5232 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5233 else
5235 /* Generate the misalignment computation outside LOOP. */
5236 pe = loop_preheader_edge (loop);
5237 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5238 gcc_assert (!new_bb);
5241 *realignment_token = gimple_call_lhs (new_stmt);
5243 /* The result of the CALL_EXPR to this builtin is determined from
5244 the value of the parameter and no global variables are touched
5245 which makes the builtin a "const" function. Requiring the
5246 builtin to have the "const" attribute makes it unnecessary
5247 to call mark_call_clobbered. */
5248 gcc_assert (TREE_READONLY (builtin_decl));
5251 if (alignment_support_scheme == dr_explicit_realign)
5252 return msq;
5254 gcc_assert (!compute_in_loop);
5255 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5258 /* 5. Create msq = phi <msq_init, lsq> in loop */
5260 pe = loop_preheader_edge (containing_loop);
5261 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5262 msq = make_ssa_name (vec_dest);
5263 phi_stmt = create_phi_node (msq, containing_loop->header);
5264 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5266 return msq;
5270 /* Function vect_grouped_load_supported.
5272 COUNT is the size of the load group (the number of statements plus the
5273 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5274 only one statement, with a gap of COUNT - 1.
5276 Returns true if a suitable permute exists. */
5278 bool
5279 vect_grouped_load_supported (tree vectype, bool single_element_p,
5280 unsigned HOST_WIDE_INT count)
5282 machine_mode mode = TYPE_MODE (vectype);
5284 /* If this is single-element interleaving with an element distance
5285 that leaves whole vector loads unused, punt - we would at least
5286 create very sub-optimal code in that case (and blow up memory,
5287 see PR65518). */
5288 if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
5290 if (dump_enabled_p ())
5291 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5292 "single-element interleaving not supported "
5293 "for not adjacent vector loads\n");
5294 return false;
5297 /* vect_permute_load_chain requires the group size to be equal to 3 or
5298 be a power of two. */
5299 if (count != 3 && exact_log2 (count) == -1)
5301 if (dump_enabled_p ())
5302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5303 "the size of the group of accesses"
5304 " is not a power of 2 or not equal to 3\n");
5305 return false;
5308 /* Check that the permutation is supported. */
5309 if (VECTOR_MODE_P (mode))
5311 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5312 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5314 if (count == 3)
5316 unsigned int k;
5317 for (k = 0; k < 3; k++)
5319 for (i = 0; i < nelt; i++)
5320 if (3 * i + k < 2 * nelt)
5321 sel[i] = 3 * i + k;
5322 else
5323 sel[i] = 0;
5324 if (!can_vec_perm_p (mode, false, sel))
5326 if (dump_enabled_p ())
5327 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5328 "shuffle of 3 loads is not supported by"
5329 " target\n");
5330 return false;
5332 for (i = 0, j = 0; i < nelt; i++)
5333 if (3 * i + k < 2 * nelt)
5334 sel[i] = i;
5335 else
5336 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5337 if (!can_vec_perm_p (mode, false, sel))
5339 if (dump_enabled_p ())
5340 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5341 "shuffle of 3 loads is not supported by"
5342 " target\n");
5343 return false;
5346 return true;
5348 else
5350 /* If length is not equal to 3 then only power of 2 is supported. */
5351 gcc_assert (pow2p_hwi (count));
5352 for (i = 0; i < nelt; i++)
5353 sel[i] = i * 2;
5354 if (can_vec_perm_p (mode, false, sel))
5356 for (i = 0; i < nelt; i++)
5357 sel[i] = i * 2 + 1;
5358 if (can_vec_perm_p (mode, false, sel))
5359 return true;
5364 if (dump_enabled_p ())
5365 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5366 "extract even/odd not supported by target\n");
5367 return false;
5370 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5371 type VECTYPE. */
5373 bool
5374 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5376 return vect_lanes_optab_supported_p ("vec_load_lanes",
5377 vec_load_lanes_optab,
5378 vectype, count);
5381 /* Function vect_permute_load_chain.
5383 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5384 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5385 the input data correctly. Return the final references for loads in
5386 RESULT_CHAIN.
5388 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5389 The input is 4 vectors each containing 8 elements. We assign a number to each
5390 element, the input sequence is:
5392 1st vec: 0 1 2 3 4 5 6 7
5393 2nd vec: 8 9 10 11 12 13 14 15
5394 3rd vec: 16 17 18 19 20 21 22 23
5395 4th vec: 24 25 26 27 28 29 30 31
5397 The output sequence should be:
5399 1st vec: 0 4 8 12 16 20 24 28
5400 2nd vec: 1 5 9 13 17 21 25 29
5401 3rd vec: 2 6 10 14 18 22 26 30
5402 4th vec: 3 7 11 15 19 23 27 31
5404 i.e., the first output vector should contain the first elements of each
5405 interleaving group, etc.
5407 We use extract_even/odd instructions to create such output. The input of
5408 each extract_even/odd operation is two vectors
5409 1st vec 2nd vec
5410 0 1 2 3 4 5 6 7
5412 and the output is the vector of extracted even/odd elements. The output of
5413 extract_even will be: 0 2 4 6
5414 and of extract_odd: 1 3 5 7
5417 The permutation is done in log LENGTH stages. In each stage extract_even
5418 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5419 their order. In our example,
5421 E1: extract_even (1st vec, 2nd vec)
5422 E2: extract_odd (1st vec, 2nd vec)
5423 E3: extract_even (3rd vec, 4th vec)
5424 E4: extract_odd (3rd vec, 4th vec)
5426 The output for the first stage will be:
5428 E1: 0 2 4 6 8 10 12 14
5429 E2: 1 3 5 7 9 11 13 15
5430 E3: 16 18 20 22 24 26 28 30
5431 E4: 17 19 21 23 25 27 29 31
5433 In order to proceed and create the correct sequence for the next stage (or
5434 for the correct output, if the second stage is the last one, as in our
5435 example), we first put the output of extract_even operation and then the
5436 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5437 The input for the second stage is:
5439 1st vec (E1): 0 2 4 6 8 10 12 14
5440 2nd vec (E3): 16 18 20 22 24 26 28 30
5441 3rd vec (E2): 1 3 5 7 9 11 13 15
5442 4th vec (E4): 17 19 21 23 25 27 29 31
5444 The output of the second stage:
5446 E1: 0 4 8 12 16 20 24 28
5447 E2: 2 6 10 14 18 22 26 30
5448 E3: 1 5 9 13 17 21 25 29
5449 E4: 3 7 11 15 19 23 27 31
5451 And RESULT_CHAIN after reordering:
5453 1st vec (E1): 0 4 8 12 16 20 24 28
5454 2nd vec (E3): 1 5 9 13 17 21 25 29
5455 3rd vec (E2): 2 6 10 14 18 22 26 30
5456 4th vec (E4): 3 7 11 15 19 23 27 31. */
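/* A minimal standalone sketch (editor's addition, not part of the
   vectorizer) of the reordering described above, modelling each vector as
   NELT ints stored back to back in CHAIN.  It assumes LENGTH is a power of
   two and is kept under "#if 0" as illustration only.  After the call,
   vector K of CHAIN holds elements K, K + LENGTH, K + 2 * LENGTH, ... of
   the original interleaved sequence, as in the example above.  */
#if 0
static void
model_permute_load_chain (int *chain, unsigned int length, unsigned int nelt)
{
  int *result = XNEWVEC (int, length * nelt);

  /* One iteration per stage; log2 (LENGTH) stages in total.  */
  for (unsigned int stage = 1; stage < length; stage *= 2)
    {
      for (unsigned int j = 0; j < length; j += 2)
        {
          const int *first = chain + j * nelt;
          const int *second = chain + (j + 1) * nelt;
          int *even = result + (j / 2) * nelt;
          int *odd = result + (j / 2 + length / 2) * nelt;

          /* Extract the even and odd elements of the concatenation
             <first, second>, i.e. apply the selectors {0, 2, 4, ...}
             and {1, 3, 5, ...}.  */
          for (unsigned int i = 0; i < nelt; i++)
            {
              even[i] = (2 * i < nelt
                         ? first[2 * i] : second[2 * i - nelt]);
              odd[i] = (2 * i + 1 < nelt
                        ? first[2 * i + 1] : second[2 * i + 1 - nelt]);
            }
        }
      memcpy (chain, result, length * nelt * sizeof (int));
    }
  XDELETEVEC (result);
}
#endif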
5458 static void
5459 vect_permute_load_chain (vec<tree> dr_chain,
5460 unsigned int length,
5461 gimple *stmt,
5462 gimple_stmt_iterator *gsi,
5463 vec<tree> *result_chain)
5465 tree data_ref, first_vect, second_vect;
5466 tree perm_mask_even, perm_mask_odd;
5467 tree perm3_mask_low, perm3_mask_high;
5468 gimple *perm_stmt;
5469 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5470 unsigned int i, j, log_length = exact_log2 (length);
5471 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5472 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5474 result_chain->quick_grow (length);
5475 memcpy (result_chain->address (), dr_chain.address (),
5476 length * sizeof (tree));
5478 if (length == 3)
5480 unsigned int k;
5482 for (k = 0; k < 3; k++)
5484 for (i = 0; i < nelt; i++)
5485 if (3 * i + k < 2 * nelt)
5486 sel[i] = 3 * i + k;
5487 else
5488 sel[i] = 0;
5489 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5491 for (i = 0, j = 0; i < nelt; i++)
5492 if (3 * i + k < 2 * nelt)
5493 sel[i] = i;
5494 else
5495 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5497 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5499 first_vect = dr_chain[0];
5500 second_vect = dr_chain[1];
5502 /* Create interleaving stmt (low part of):
5503 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5504 ...}> */
5505 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5506 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5507 second_vect, perm3_mask_low);
5508 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5510 /* Create interleaving stmt (high part of):
5511 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5512 ...}> */
5513 first_vect = data_ref;
5514 second_vect = dr_chain[2];
5515 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5516 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5517 second_vect, perm3_mask_high);
5518 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5519 (*result_chain)[k] = data_ref;
5522 else
5524 /* If length is not equal to 3 then only a power of 2 is supported. */
5525 gcc_assert (pow2p_hwi (length));
5527 for (i = 0; i < nelt; ++i)
5528 sel[i] = i * 2;
5529 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5531 for (i = 0; i < nelt; ++i)
5532 sel[i] = i * 2 + 1;
5533 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5535 for (i = 0; i < log_length; i++)
5537 for (j = 0; j < length; j += 2)
5539 first_vect = dr_chain[j];
5540 second_vect = dr_chain[j+1];
5542 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5543 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5544 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5545 first_vect, second_vect,
5546 perm_mask_even);
5547 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5548 (*result_chain)[j/2] = data_ref;
5550 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5551 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5552 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5553 first_vect, second_vect,
5554 perm_mask_odd);
5555 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5556 (*result_chain)[j/2+length/2] = data_ref;
5558 memcpy (dr_chain.address (), result_chain->address (),
5559 length * sizeof (tree));
5564 /* Function vect_shift_permute_load_chain.
5566 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5567 a sequence of stmts to reorder the input data accordingly.
5568 Return the final references for loads in RESULT_CHAIN.
5569 Return true if successful, false otherwise.
5571 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5572 The input is 3 vectors each containing 8 elements. We assign a
5573 number to each element; the input sequence is:
5575 1st vec: 0 1 2 3 4 5 6 7
5576 2nd vec: 8 9 10 11 12 13 14 15
5577 3rd vec: 16 17 18 19 20 21 22 23
5579 The output sequence should be:
5581 1st vec: 0 3 6 9 12 15 18 21
5582 2nd vec: 1 4 7 10 13 16 19 22
5583 3rd vec: 2 5 8 11 14 17 20 23
5585 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5587 First we shuffle all 3 vectors to get the correct element order:
5589 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5590 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5591 3rd vec: (16 19 22) (17 20 23) (18 21)
5593 Next we unite and shift the vectors 3 times:
5595 1st step:
5596 shift right by 6 the concatenation of:
5597 "1st vec" and "2nd vec"
5598 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5599 "2nd vec" and "3rd vec"
5600 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5601 "3rd vec" and "1st vec"
5602 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5603 | New vectors |
5605 So that now the new vectors are:
5607 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5608 2nd vec: (10 13) (16 19 22) (17 20 23)
5609 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5611 2nd step:
5612 shift right by 5 the concatenation of:
5613 "1st vec" and "3rd vec"
5614 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5615 "2nd vec" and "1st vec"
5616 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5617 "3rd vec" and "2nd vec"
5618 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5619 | New vectors |
5621 So that now the new vectors are:
5623 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5624 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5625 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5627 3rd step:
5628 shift right by 5 the concatenation of:
5629 "1st vec" and "1st vec"
5630 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5631 shift right by 3 the concatenation of:
5632 "2nd vec" and "2nd vec"
5633 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5634 | New vectors |
5636 So that now all vectors are READY:
5637 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5638 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5639 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5641 This algorithm is faster than the one in vect_permute_load_chain if:
5642 1. "shift of a concatenation" is faster than general permutation.
5643 This is usually so.
5644 2. The TARGET machine can't execute vector instructions in parallel.
5645 This is because each step of the algorithm depends on the previous one.
5646 The algorithm in vect_permute_load_chain is much more parallel.
5648 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
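/* A small standalone sketch (editor's addition, not part of the vectorizer)
   that evaluates the selector formulas used by the LENGTH == 3 path below
   for a given NELT.  For NELT == 8 it reproduces the masks quoted in the
   comments: perm3 {0 3 6 1 4 7 2 5}, shift1 {6 ... 13}, shift2 {5 ... 12},
   shift3 {3 ... 10} and shift4 {5 ... 12}.  Kept under "#if 0" as
   illustration only.  */
#if 0
#include <stdio.h>

static void
print_shift_permute_masks (unsigned int nelt)
{
  unsigned int i, k = 0, l = 0;

  printf ("perm3 :");
  for (i = 0; i < nelt; i++)
    {
      if (3 * k + (l % 3) >= nelt)
        {
          k = 0;
          l += 3 - (nelt % 3);
        }
      printf (" %u", 3 * k + (l % 3));
      k++;
    }

  printf ("\nshift1:");
  for (i = 0; i < nelt; i++)
    printf (" %u", 2 * (nelt / 3) + (nelt % 3) + i);

  printf ("\nshift2:");
  for (i = 0; i < nelt; i++)
    printf (" %u", 2 * (nelt / 3) + 1 + i);

  printf ("\nshift3:");
  for (i = 0; i < nelt; i++)
    printf (" %u", nelt / 3 + (nelt % 3) / 2 + i);

  printf ("\nshift4:");
  for (i = 0; i < nelt; i++)
    printf (" %u", 2 * (nelt / 3) + (nelt % 3) / 2 + i);
  printf ("\n");
}
#endif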
5651 static bool
5652 vect_shift_permute_load_chain (vec<tree> dr_chain,
5653 unsigned int length,
5654 gimple *stmt,
5655 gimple_stmt_iterator *gsi,
5656 vec<tree> *result_chain)
5658 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5659 tree perm2_mask1, perm2_mask2, perm3_mask;
5660 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5661 gimple *perm_stmt;
5663 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5664 unsigned int i;
5665 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5666 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5667 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5668 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5670 result_chain->quick_grow (length);
5671 memcpy (result_chain->address (), dr_chain.address (),
5672 length * sizeof (tree));
5674 if (pow2p_hwi (length) && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5676 unsigned int j, log_length = exact_log2 (length);
5677 for (i = 0; i < nelt / 2; ++i)
5678 sel[i] = i * 2;
5679 for (i = 0; i < nelt / 2; ++i)
5680 sel[nelt / 2 + i] = i * 2 + 1;
5681 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5683 if (dump_enabled_p ())
5684 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5685 "shuffle of 2 fields structure is not \
5686 supported by target\n");
5687 return false;
5689 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5691 for (i = 0; i < nelt / 2; ++i)
5692 sel[i] = i * 2 + 1;
5693 for (i = 0; i < nelt / 2; ++i)
5694 sel[nelt / 2 + i] = i * 2;
5695 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5697 if (dump_enabled_p ())
5698 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5699 "shuffle of 2 fields structure is not \
5700 supported by target\n");
5701 return false;
5703 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5705 /* Generating permutation constant to shift all elements.
5706 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5707 for (i = 0; i < nelt; i++)
5708 sel[i] = nelt / 2 + i;
5709 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5711 if (dump_enabled_p ())
5712 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5713 "shift permutation is not supported by target\n");
5714 return false;
5716 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5718 /* Generating permutation constant to select elements from 2 vectors.
5719 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5720 for (i = 0; i < nelt / 2; i++)
5721 sel[i] = i;
5722 for (i = nelt / 2; i < nelt; i++)
5723 sel[i] = nelt + i;
5724 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5726 if (dump_enabled_p ())
5727 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5728 "select is not supported by target\n");
5729 return false;
5731 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5733 for (i = 0; i < log_length; i++)
5735 for (j = 0; j < length; j += 2)
5737 first_vect = dr_chain[j];
5738 second_vect = dr_chain[j + 1];
5740 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5741 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5742 first_vect, first_vect,
5743 perm2_mask1);
5744 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5745 vect[0] = data_ref;
5747 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5748 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5749 second_vect, second_vect,
5750 perm2_mask2);
5751 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5752 vect[1] = data_ref;
5754 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5755 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5756 vect[0], vect[1], shift1_mask);
5757 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5758 (*result_chain)[j/2 + length/2] = data_ref;
5760 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5761 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5762 vect[0], vect[1], select_mask);
5763 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5764 (*result_chain)[j/2] = data_ref;
5766 memcpy (dr_chain.address (), result_chain->address (),
5767 length * sizeof (tree));
5769 return true;
5771 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5773 unsigned int k = 0, l = 0;
5775 /* Generating permutation constant to get all elements in right order.
5776 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5777 for (i = 0; i < nelt; i++)
5779 if (3 * k + (l % 3) >= nelt)
5781 k = 0;
5782 l += (3 - (nelt % 3));
5784 sel[i] = 3 * k + (l % 3);
5785 k++;
5787 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5789 if (dump_enabled_p ())
5790 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5791 "shuffle of 3 fields structure is not \
5792 supported by target\n");
5793 return false;
5795 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5797 /* Generating permutation constant to shift all elements.
5798 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5799 for (i = 0; i < nelt; i++)
5800 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5801 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5803 if (dump_enabled_p ())
5804 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5805 "shift permutation is not supported by target\n");
5806 return false;
5808 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5810 /* Generating permutation constant to shift all elements.
5811 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5812 for (i = 0; i < nelt; i++)
5813 sel[i] = 2 * (nelt / 3) + 1 + i;
5814 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5816 if (dump_enabled_p ())
5817 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5818 "shift permutation is not supported by target\n");
5819 return false;
5821 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5823 /* Generating permutation constant to shift all elements.
5824 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5825 for (i = 0; i < nelt; i++)
5826 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5827 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5829 if (dump_enabled_p ())
5830 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5831 "shift permutation is not supported by target\n");
5832 return false;
5834 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5836 /* Generating permutation constant to shift all elements.
5837 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5838 for (i = 0; i < nelt; i++)
5839 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5840 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5842 if (dump_enabled_p ())
5843 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5844 "shift permutation is not supported by target\n");
5845 return false;
5847 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5849 for (k = 0; k < 3; k++)
5851 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5852 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5853 dr_chain[k], dr_chain[k],
5854 perm3_mask);
5855 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5856 vect[k] = data_ref;
5859 for (k = 0; k < 3; k++)
5861 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5862 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5863 vect[k % 3], vect[(k + 1) % 3],
5864 shift1_mask);
5865 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5866 vect_shift[k] = data_ref;
5869 for (k = 0; k < 3; k++)
5871 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5872 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5873 vect_shift[(4 - k) % 3],
5874 vect_shift[(3 - k) % 3],
5875 shift2_mask);
5876 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5877 vect[k] = data_ref;
5880 (*result_chain)[3 - (nelt % 3)] = vect[2];
5882 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5883 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5884 vect[0], shift3_mask);
5885 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5886 (*result_chain)[nelt % 3] = data_ref;
5888 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5889 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5890 vect[1], shift4_mask);
5891 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5892 (*result_chain)[0] = data_ref;
5893 return true;
5895 return false;
5898 /* Function vect_transform_grouped_load.
5900 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5901 to perform their permutation and ascribe the resulting vectorized statements to
5902 the scalar statements.
5905 void
5906 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5907 gimple_stmt_iterator *gsi)
5909 machine_mode mode;
5910 vec<tree> result_chain = vNULL;
5912 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5913 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
5914 vectors that are ready for vector computation. */
5915 result_chain.create (size);
5917 /* If the reassociation width for the vector type is 2 or greater, the target
5918 machine can execute 2 or more vector instructions in parallel. Otherwise
5919 try to get the chain for the loads group using vect_shift_permute_load_chain. */
5920 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5921 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5922 || pow2p_hwi (size)
5923 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5924 gsi, &result_chain))
5925 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5926 vect_record_grouped_load_vectors (stmt, result_chain);
5927 result_chain.release ();
5930 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5931 generated as part of the vectorization of STMT. Assign the statement
5932 for each vector to the associated scalar statement. */
5934 void
5935 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5937 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5938 gimple *next_stmt, *new_stmt;
5939 unsigned int i, gap_count;
5940 tree tmp_data_ref;
5942 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5943 Since we scan the chain starting from its first node, their order
5944 corresponds to the order of data-refs in RESULT_CHAIN. */
5945 next_stmt = first_stmt;
5946 gap_count = 1;
5947 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5949 if (!next_stmt)
5950 break;
5952 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5953 code elimination pass later. No need to check for the first stmt in
5954 the group, since it always exists.
5955 GROUP_GAP is the number of steps in elements from the previous
5956 access (if there is no gap GROUP_GAP is 1). We skip loads that
5957 correspond to the gaps. */
5958 if (next_stmt != first_stmt
5959 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5961 gap_count++;
5962 continue;
5965 while (next_stmt)
5967 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5968 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5969 copies, and we put the new vector statement in the first available
5970 RELATED_STMT. */
5971 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5972 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5973 else
5975 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5977 gimple *prev_stmt =
5978 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5979 gimple *rel_stmt =
5980 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5981 while (rel_stmt)
5983 prev_stmt = rel_stmt;
5984 rel_stmt =
5985 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5988 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5989 new_stmt;
5993 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5994 gap_count = 1;
5995 /* If NEXT_STMT accesses the same DR as the previous statement,
5996 put the same TMP_DATA_REF as its vectorized statement; otherwise
5997 get the next data-ref from RESULT_CHAIN. */
5998 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5999 break;
6004 /* Function vect_can_force_dr_alignment_p.
6006 Returns whether the alignment of a DECL can be forced to be aligned
6007 on an ALIGNMENT-bit boundary. */
6009 bool
6010 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6012 if (!VAR_P (decl))
6013 return false;
6015 if (decl_in_symtab_p (decl)
6016 && !symtab_node::get (decl)->can_increase_alignment_p ())
6017 return false;
6019 if (TREE_STATIC (decl))
6020 return (alignment <= MAX_OFILE_ALIGNMENT);
6021 else
6022 return (alignment <= MAX_STACK_ALIGNMENT);
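/* A hedged sketch (editor's addition, not part of GCC): a caller that wants
   to raise a declaration's alignment to a vector type's natural alignment
   would first consult the predicate above, roughly as below.  The helper is
   hypothetical and kept under "#if 0"; the real forcing of DECL_ALIGN
   happens elsewhere in the vectorizer.  */
#if 0
static bool
can_align_base_for_vectype_p (tree base_decl, tree vectype)
{
  /* TYPE_ALIGN is in bits, matching the ALIGNMENT argument checked against
     MAX_OFILE_ALIGNMENT / MAX_STACK_ALIGNMENT above.  */
  return (DECL_P (base_decl)
          && vect_can_force_dr_alignment_p (base_decl, TYPE_ALIGN (vectype)));
}
#endif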
6026 /* Return whether the data reference DR is supported with respect to its
6027 alignment.
6028 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6029 if it is aligned, i.e., check if it is possible to vectorize it with different
6030 alignment. */
6032 enum dr_alignment_support
6033 vect_supportable_dr_alignment (struct data_reference *dr,
6034 bool check_aligned_accesses)
6036 gimple *stmt = DR_STMT (dr);
6037 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6038 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6039 machine_mode mode = TYPE_MODE (vectype);
6040 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6041 struct loop *vect_loop = NULL;
6042 bool nested_in_vect_loop = false;
6044 if (aligned_access_p (dr) && !check_aligned_accesses)
6045 return dr_aligned;
6047 /* For now assume all conditional loads/stores support unaligned
6048 access without any special code. */
6049 if (is_gimple_call (stmt)
6050 && gimple_call_internal_p (stmt)
6051 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6052 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6053 return dr_unaligned_supported;
6055 if (loop_vinfo)
6057 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6058 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6061 /* Possibly unaligned access. */
6063 /* We can choose between using the implicit realignment scheme (generating
6064 a misaligned_move stmt) and the explicit realignment scheme (generating
6065 aligned loads with a REALIGN_LOAD). There are two variants of the
6066 explicit realignment scheme: optimized and unoptimized.
6067 We can optimize the realignment only if the step between consecutive
6068 vector loads is equal to the vector size. Since the vector memory
6069 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6070 is guaranteed that the misalignment amount remains the same throughout the
6071 execution of the vectorized loop. Therefore, we can create the
6072 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6073 at the loop preheader.
6075 However, in the case of outer-loop vectorization, when vectorizing a
6076 memory access in the inner-loop nested within the LOOP that is now being
6077 vectorized, while it is guaranteed that the misalignment of the
6078 vectorized memory access will remain the same in different outer-loop
6079 iterations, it is *not* guaranteed that it will remain the same throughout
6080 the execution of the inner-loop. This is because the inner-loop advances
6081 with the original scalar step (and not in steps of VS). If the inner-loop
6082 step happens to be a multiple of VS, then the misalignment remains fixed
6083 and we can use the optimized realignment scheme. For example:
6085 for (i=0; i<N; i++)
6086 for (j=0; j<M; j++)
6087 s += a[i+j];
6089 When vectorizing the i-loop in the above example, the step between
6090 consecutive vector loads is 1, and so the misalignment does not remain
6091 fixed across the execution of the inner-loop, and the realignment cannot
6092 be optimized (as illustrated in the following pseudo vectorized loop):
6094 for (i=0; i<N; i+=4)
6095 for (j=0; j<M; j++){
6096 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6097 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6098 // (assuming that we start from an aligned address).
6101 We therefore have to use the unoptimized realignment scheme:
6103 for (i=0; i<N; i+=4)
6104 for (j=k; j<M; j+=4)
6105 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6106 // that the misalignment of the initial address is
6107 // 0).
6109 The loop can then be vectorized as follows:
6111 for (k=0; k<4; k++){
6112 rt = get_realignment_token (&vp[k]);
6113 for (i=0; i<N; i+=4){
6114 v1 = vp[i+k];
6115 for (j=k; j<M; j+=4){
6116 v2 = vp[i+j+VS-1];
6117 va = REALIGN_LOAD <v1,v2,rt>;
6118 vs += va;
6119 v1 = v2;
6122 } */
6124 if (DR_IS_READ (dr))
6126 bool is_packed = false;
6127 tree type = (TREE_TYPE (DR_REF (dr)));
6129 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6130 && (!targetm.vectorize.builtin_mask_for_load
6131 || targetm.vectorize.builtin_mask_for_load ()))
6133 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6135 /* If we are doing SLP then the accesses need not have the
6136 same alignment; instead it depends on the SLP group size. */
6137 if (loop_vinfo
6138 && STMT_SLP_TYPE (stmt_info)
6139 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6140 * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
6141 % TYPE_VECTOR_SUBPARTS (vectype) != 0))
6143 else if (!loop_vinfo
6144 || (nested_in_vect_loop
6145 && (TREE_INT_CST_LOW (DR_STEP (dr))
6146 != GET_MODE_SIZE (TYPE_MODE (vectype)))))
6147 return dr_explicit_realign;
6148 else
6149 return dr_explicit_realign_optimized;
6151 if (!known_alignment_for_access_p (dr))
6152 is_packed = not_size_aligned (DR_REF (dr));
6154 if ((TYPE_USER_ALIGN (type) && !is_packed)
6155 || targetm.vectorize.
6156 support_vector_misalignment (mode, type,
6157 DR_MISALIGNMENT (dr), is_packed))
6158 /* Can't software pipeline the loads, but can at least do them. */
6159 return dr_unaligned_supported;
6161 else
6163 bool is_packed = false;
6164 tree type = (TREE_TYPE (DR_REF (dr)));
6166 if (!known_alignment_for_access_p (dr))
6167 is_packed = not_size_aligned (DR_REF (dr));
6169 if ((TYPE_USER_ALIGN (type) && !is_packed)
6170 || targetm.vectorize.
6171 support_vector_misalignment (mode, type,
6172 DR_MISALIGNMENT (dr), is_packed))
6173 return dr_unaligned_supported;
6176 /* Unsupported. */
6177 return dr_unaligned_unsupported;
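/* A minimal sketch (editor's addition, not part of GCC) showing how a caller
   can dispatch on the classification returned by
   vect_supportable_dr_alignment.  The helper and its strings are
   hypothetical; kept under "#if 0" as illustration only.  */
#if 0
static const char *
dr_alignment_support_name (struct data_reference *dr)
{
  switch (vect_supportable_dr_alignment (dr, false))
    {
    case dr_aligned:
      return "aligned access";
    case dr_unaligned_supported:
      return "misaligned access supported by hardware";
    case dr_explicit_realign:
      return "explicit realignment (unoptimized)";
    case dr_explicit_realign_optimized:
      return "explicit realignment (optimized)";
    default:
      return "misaligned access not supported";
    }
}
#endif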