[testsuite] Require shared effective target for some lto.exp tests
[official-gcc.git] / gcc / tree-vect-data-refs.c
blobc1fbbc958e07051e33e58774c877863adcafa200
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2017 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
54 /* Return true if load- or store-lanes optab OPTAB is implemented for
55 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
57 static bool
58 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
59 tree vectype, unsigned HOST_WIDE_INT count)
61 machine_mode mode, array_mode;
62 bool limit_p;
64 mode = TYPE_MODE (vectype);
65 limit_p = !targetm.array_mode_supported_p (mode, count);
66 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
67 MODE_INT, limit_p);
69 if (array_mode == BLKmode)
71 if (dump_enabled_p ())
72 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
73 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
74 GET_MODE_NAME (mode), count);
75 return false;
78 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
80 if (dump_enabled_p ())
81 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
82 "cannot use %s<%s><%s>\n", name,
83 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
84 return false;
87 if (dump_enabled_p ())
88 dump_printf_loc (MSG_NOTE, vect_location,
89 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
90 GET_MODE_NAME (mode));
92 return true;
96 /* Return the smallest scalar part of STMT.
97 This is used to determine the vectype of the stmt. We generally set the
98 vectype according to the type of the result (lhs). For stmts whose
99 result-type is different than the type of the arguments (e.g., demotion,
100 promotion), vectype will be reset appropriately (later). Note that we have
101 to visit the smallest datatype in this function, because that determines the
102 VF. If the smallest datatype in the loop is present only as the rhs of a
103 promotion operation - we'd miss it.
104 Such a case, where a variable of this datatype does not appear in the lhs
105 anywhere in the loop, can only occur if it's an invariant: e.g.:
106 'int_x = (int) short_inv', which we'd expect to have been optimized away by
107 invariant motion. However, we cannot rely on invariant motion to always
108 take invariants out of the loop, and so in the case of promotion we also
109 have to check the rhs.
110 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
111 types. */
113 tree
114 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
115 HOST_WIDE_INT *rhs_size_unit)
117 tree scalar_type = gimple_expr_type (stmt);
118 HOST_WIDE_INT lhs, rhs;
120 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
122 if (is_gimple_assign (stmt)
123 && (gimple_assign_cast_p (stmt)
124 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
125 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
126 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
128 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
130 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
131 if (rhs < lhs)
132 scalar_type = rhs_type;
135 *lhs_size_unit = lhs;
136 *rhs_size_unit = rhs;
137 return scalar_type;
141 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
142 tested at run-time. Return TRUE if DDR was successfully inserted.
143 Return false if versioning is not supported. */
145 static bool
146 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
148 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
150 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
151 return false;
153 if (dump_enabled_p ())
155 dump_printf_loc (MSG_NOTE, vect_location,
156 "mark for run-time aliasing test between ");
157 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
158 dump_printf (MSG_NOTE, " and ");
159 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
160 dump_printf (MSG_NOTE, "\n");
163 if (optimize_loop_nest_for_size_p (loop))
165 if (dump_enabled_p ())
166 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
167 "versioning not supported when optimizing"
168 " for size.\n");
169 return false;
172 /* FORNOW: We don't support versioning with outer-loop vectorization. */
173 if (loop->inner)
175 if (dump_enabled_p ())
176 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
177 "versioning not yet supported for outer-loops.\n");
178 return false;
181 /* FORNOW: We don't support creating runtime alias tests for non-constant
182 step. */
183 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
184 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
186 if (dump_enabled_p ())
187 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
188 "versioning not yet supported for non-constant "
189 "step\n");
190 return false;
193 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
194 return true;
198 /* Function vect_analyze_data_ref_dependence.
200 Return TRUE if there (might) exist a dependence between a memory-reference
201 DRA and a memory-reference DRB. When versioning for alias may check a
202 dependence at run-time, return FALSE. Adjust *MAX_VF according to
203 the data dependence. */
205 static bool
206 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
207 loop_vec_info loop_vinfo, int *max_vf)
209 unsigned int i;
210 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
211 struct data_reference *dra = DDR_A (ddr);
212 struct data_reference *drb = DDR_B (ddr);
213 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
214 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
215 lambda_vector dist_v;
216 unsigned int loop_depth;
218 /* In loop analysis all data references should be vectorizable. */
219 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
220 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
221 gcc_unreachable ();
223 /* Independent data accesses. */
224 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
225 return false;
227 if (dra == drb
228 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
229 return false;
231 /* We do not have to consider dependences between accesses that belong
232 to the same group. */
233 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
234 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
235 return false;
237 /* Even if we have an anti-dependence then, as the vectorized loop covers at
238 least two scalar iterations, there is always also a true dependence.
239 As the vectorizer does not re-order loads and stores we can ignore
240 the anti-dependence if TBAA can disambiguate both DRs similar to the
241 case with known negative distance anti-dependences (positive
242 distance anti-dependences would violate TBAA constraints). */
243 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
244 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
245 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
246 get_alias_set (DR_REF (drb))))
247 return false;
249 /* Unknown data dependence. */
250 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
252 /* If user asserted safelen consecutive iterations can be
253 executed concurrently, assume independence. */
254 if (loop->safelen >= 2)
256 if (loop->safelen < *max_vf)
257 *max_vf = loop->safelen;
258 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
259 return false;
262 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
263 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
265 if (dump_enabled_p ())
267 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
268 "versioning for alias not supported for: "
269 "can't determine dependence between ");
270 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
271 DR_REF (dra));
272 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
273 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
274 DR_REF (drb));
275 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
277 return true;
280 if (dump_enabled_p ())
282 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
283 "versioning for alias required: "
284 "can't determine dependence between ");
285 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
286 DR_REF (dra));
287 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
288 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
289 DR_REF (drb));
290 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
293 /* Add to list of ddrs that need to be tested at run-time. */
294 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
297 /* Known data dependence. */
298 if (DDR_NUM_DIST_VECTS (ddr) == 0)
300 /* If user asserted safelen consecutive iterations can be
301 executed concurrently, assume independence. */
302 if (loop->safelen >= 2)
304 if (loop->safelen < *max_vf)
305 *max_vf = loop->safelen;
306 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
307 return false;
310 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
311 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
313 if (dump_enabled_p ())
315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
316 "versioning for alias not supported for: "
317 "bad dist vector for ");
318 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
319 DR_REF (dra));
320 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
321 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
322 DR_REF (drb));
323 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
325 return true;
328 if (dump_enabled_p ())
330 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
331 "versioning for alias required: "
332 "bad dist vector for ");
333 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
334 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
335 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
336 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
338 /* Add to list of ddrs that need to be tested at run-time. */
339 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
342 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
343 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
345 int dist = dist_v[loop_depth];
347 if (dump_enabled_p ())
348 dump_printf_loc (MSG_NOTE, vect_location,
349 "dependence distance = %d.\n", dist);
351 if (dist == 0)
353 if (dump_enabled_p ())
355 dump_printf_loc (MSG_NOTE, vect_location,
356 "dependence distance == 0 between ");
357 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
358 dump_printf (MSG_NOTE, " and ");
359 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
360 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
363 /* When we perform grouped accesses and perform implicit CSE
364 by detecting equal accesses and doing disambiguation with
365 runtime alias tests like for
366 .. = a[i];
367 .. = a[i+1];
368 a[i] = ..;
369 a[i+1] = ..;
370 *p = ..;
371 .. = a[i];
372 .. = a[i+1];
373 where we will end up loading { a[i], a[i+1] } once, make
374 sure that inserting group loads before the first load and
375 stores after the last store will do the right thing.
376 Similar for groups like
377 a[i] = ...;
378 ... = a[i];
379 a[i+1] = ...;
380 where loads from the group interleave with the store. */
381 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
382 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
384 gimple *earlier_stmt;
385 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
386 if (DR_IS_WRITE
387 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
389 if (dump_enabled_p ())
390 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
391 "READ_WRITE dependence in interleaving."
392 "\n");
393 return true;
397 continue;
400 if (dist > 0 && DDR_REVERSED_P (ddr))
402 /* If DDR_REVERSED_P the order of the data-refs in DDR was
403 reversed (to make distance vector positive), and the actual
404 distance is negative. */
405 if (dump_enabled_p ())
406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
407 "dependence distance negative.\n");
408 /* Record a negative dependence distance to later limit the
409 amount of stmt copying / unrolling we can perform.
410 Only need to handle read-after-write dependence. */
411 if (DR_IS_READ (drb)
412 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
413 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
414 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
415 continue;
418 if (abs (dist) >= 2
419 && abs (dist) < *max_vf)
421 /* The dependence distance requires reduction of the maximal
422 vectorization factor. */
423 *max_vf = abs (dist);
424 if (dump_enabled_p ())
425 dump_printf_loc (MSG_NOTE, vect_location,
426 "adjusting maximal vectorization factor to %i\n",
427 *max_vf);
430 if (abs (dist) >= *max_vf)
432 /* Dependence distance does not create dependence, as far as
433 vectorization is concerned, in this case. */
434 if (dump_enabled_p ())
435 dump_printf_loc (MSG_NOTE, vect_location,
436 "dependence distance >= VF.\n");
437 continue;
440 if (dump_enabled_p ())
442 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
443 "not vectorized, possible dependence "
444 "between data-refs ");
445 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
446 dump_printf (MSG_NOTE, " and ");
447 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
448 dump_printf (MSG_NOTE, "\n");
451 return true;
454 return false;
457 /* Function vect_analyze_data_ref_dependences.
459 Examine all the data references in the loop, and make sure there do not
460 exist any data dependences between them. Set *MAX_VF according to
461 the maximum vectorization factor the data dependences allow. */
463 bool
464 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
466 unsigned int i;
467 struct data_dependence_relation *ddr;
469 if (dump_enabled_p ())
470 dump_printf_loc (MSG_NOTE, vect_location,
471 "=== vect_analyze_data_ref_dependences ===\n");
473 LOOP_VINFO_DDRS (loop_vinfo)
474 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
475 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
476 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
477 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
478 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
479 &LOOP_VINFO_DDRS (loop_vinfo),
480 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
481 return false;
483 /* For epilogues we either have no aliases or alias versioning
484 was applied to original loop. Therefore we may just get max_vf
485 using VF of original loop. */
486 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
487 *max_vf = LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo);
488 else
489 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
490 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
491 return false;
493 return true;
497 /* Function vect_slp_analyze_data_ref_dependence.
499 Return TRUE if there (might) exist a dependence between a memory-reference
500 DRA and a memory-reference DRB. When versioning for alias may check a
501 dependence at run-time, return FALSE. Adjust *MAX_VF according to
502 the data dependence. */
504 static bool
505 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
507 struct data_reference *dra = DDR_A (ddr);
508 struct data_reference *drb = DDR_B (ddr);
510 /* We need to check dependences of statements marked as unvectorizable
511 as well, they still can prohibit vectorization. */
513 /* Independent data accesses. */
514 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
515 return false;
517 if (dra == drb)
518 return false;
520 /* Read-read is OK. */
521 if (DR_IS_READ (dra) && DR_IS_READ (drb))
522 return false;
524 /* If dra and drb are part of the same interleaving chain consider
525 them independent. */
526 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
527 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
528 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
529 return false;
531 /* Unknown data dependence. */
532 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
534 if (dump_enabled_p ())
536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
537 "can't determine dependence between ");
538 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
539 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
540 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
541 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
544 else if (dump_enabled_p ())
546 dump_printf_loc (MSG_NOTE, vect_location,
547 "determined dependence between ");
548 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
549 dump_printf (MSG_NOTE, " and ");
550 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
551 dump_printf (MSG_NOTE, "\n");
554 return true;
558 /* Analyze dependences involved in the transform of SLP NODE. STORES
559 contain the vector of scalar stores of this instance if we are
560 disambiguating the loads. */
562 static bool
563 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
564 vec<gimple *> stores, gimple *last_store)
566 /* This walks over all stmts involved in the SLP load/store done
567 in NODE verifying we can sink them up to the last stmt in the
568 group. */
569 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
570 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
572 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
573 if (access == last_access)
574 continue;
575 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
576 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
577 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
579 gimple *stmt = gsi_stmt (gsi);
580 if (! gimple_vuse (stmt)
581 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
582 continue;
584 /* If we couldn't record a (single) data reference for this
585 stmt we have to give up. */
586 /* ??? Here and below if dependence analysis fails we can resort
587 to the alias oracle which can handle more kinds of stmts. */
588 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
589 if (!dr_b)
590 return false;
592 bool dependent = false;
593 /* If we run into a store of this same instance (we've just
594 marked those) then delay dependence checking until we run
595 into the last store because this is where it will have
596 been sunk to (and we verify if we can do that as well). */
597 if (gimple_visited_p (stmt))
599 if (stmt != last_store)
600 continue;
601 unsigned i;
602 gimple *store;
603 FOR_EACH_VEC_ELT (stores, i, store)
605 data_reference *store_dr
606 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
607 ddr_p ddr = initialize_data_dependence_relation
608 (dr_a, store_dr, vNULL);
609 dependent = vect_slp_analyze_data_ref_dependence (ddr);
610 free_dependence_relation (ddr);
611 if (dependent)
612 break;
615 else
617 ddr_p ddr = initialize_data_dependence_relation (dr_a,
618 dr_b, vNULL);
619 dependent = vect_slp_analyze_data_ref_dependence (ddr);
620 free_dependence_relation (ddr);
622 if (dependent)
623 return false;
626 return true;
630 /* Function vect_analyze_data_ref_dependences.
632 Examine all the data references in the basic-block, and make sure there
633 do not exist any data dependences between them. Set *MAX_VF according to
634 the maximum vectorization factor the data dependences allow. */
636 bool
637 vect_slp_analyze_instance_dependence (slp_instance instance)
639 if (dump_enabled_p ())
640 dump_printf_loc (MSG_NOTE, vect_location,
641 "=== vect_slp_analyze_instance_dependence ===\n");
643 /* The stores of this instance are at the root of the SLP tree. */
644 slp_tree store = SLP_INSTANCE_TREE (instance);
645 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
646 store = NULL;
648 /* Verify we can sink stores to the vectorized stmt insert location. */
649 gimple *last_store = NULL;
650 if (store)
652 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
653 return false;
655 /* Mark stores in this instance and remember the last one. */
656 last_store = vect_find_last_scalar_stmt_in_slp (store);
657 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
658 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
661 bool res = true;
663 /* Verify we can sink loads to the vectorized stmt insert location,
664 special-casing stores of this instance. */
665 slp_tree load;
666 unsigned int i;
667 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
668 if (! vect_slp_analyze_node_dependences (instance, load,
669 store
670 ? SLP_TREE_SCALAR_STMTS (store)
671 : vNULL, last_store))
673 res = false;
674 break;
677 /* Unset the visited flag. */
678 if (store)
679 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
680 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
682 return res;
685 /* Function vect_compute_data_ref_alignment
687 Compute the misalignment of the data reference DR.
689 Output:
690 1. If during the misalignment computation it is found that the data reference
691 cannot be vectorized then false is returned.
692 2. DR_MISALIGNMENT (DR) is defined.
694 FOR NOW: No analysis is actually performed. Misalignment is calculated
695 only for trivial cases. TODO. */
697 bool
698 vect_compute_data_ref_alignment (struct data_reference *dr)
700 gimple *stmt = DR_STMT (dr);
701 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
702 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
703 struct loop *loop = NULL;
704 tree ref = DR_REF (dr);
705 tree vectype;
706 tree base, base_addr;
707 tree misalign = NULL_TREE;
708 tree aligned_to;
709 tree step;
710 unsigned HOST_WIDE_INT alignment;
712 if (dump_enabled_p ())
713 dump_printf_loc (MSG_NOTE, vect_location,
714 "vect_compute_data_ref_alignment:\n");
716 if (loop_vinfo)
717 loop = LOOP_VINFO_LOOP (loop_vinfo);
719 /* Initialize misalignment to unknown. */
720 SET_DR_MISALIGNMENT (dr, -1);
722 if (tree_fits_shwi_p (DR_STEP (dr)))
723 misalign = DR_INIT (dr);
724 aligned_to = DR_ALIGNED_TO (dr);
725 base_addr = DR_BASE_ADDRESS (dr);
726 vectype = STMT_VINFO_VECTYPE (stmt_info);
728 /* In case the dataref is in an inner-loop of the loop that is being
729 vectorized (LOOP), we use the base and misalignment information
730 relative to the outer-loop (LOOP). This is ok only if the misalignment
731 stays the same throughout the execution of the inner-loop, which is why
732 we have to check that the stride of the dataref in the inner-loop evenly
733 divides by the vector size. */
734 if (loop && nested_in_vect_loop_p (loop, stmt))
736 tree step = DR_STEP (dr);
738 if (tree_fits_shwi_p (step)
739 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
741 if (dump_enabled_p ())
742 dump_printf_loc (MSG_NOTE, vect_location,
743 "inner step divides the vector-size.\n");
744 misalign = STMT_VINFO_DR_INIT (stmt_info);
745 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
746 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
748 else
750 if (dump_enabled_p ())
751 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
752 "inner step doesn't divide the vector-size.\n");
753 misalign = NULL_TREE;
757 /* Similarly we can only use base and misalignment information relative to
758 an innermost loop if the misalignment stays the same throughout the
759 execution of the loop. As above, this is the case if the stride of
760 the dataref evenly divides by the vector size. */
761 else
763 tree step = DR_STEP (dr);
764 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
766 if (tree_fits_shwi_p (step)
767 && ((tree_to_shwi (step) * vf)
768 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
770 if (dump_enabled_p ())
771 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
772 "step doesn't divide the vector-size.\n");
773 misalign = NULL_TREE;
777 /* To look at alignment of the base we have to preserve an inner MEM_REF
778 as that carries alignment information of the actual access. */
779 base = ref;
780 while (handled_component_p (base))
781 base = TREE_OPERAND (base, 0);
782 unsigned int base_alignment;
783 unsigned HOST_WIDE_INT base_bitpos;
784 get_object_alignment_1 (base, &base_alignment, &base_bitpos);
785 /* As data-ref analysis strips the MEM_REF down to its base operand
786 to form DR_BASE_ADDRESS and adds the offset to DR_INIT we have to
787 adjust things to make base_alignment valid as the alignment of
788 DR_BASE_ADDRESS. */
789 if (TREE_CODE (base) == MEM_REF)
791 base_bitpos -= mem_ref_offset (base).to_short_addr () * BITS_PER_UNIT;
792 base_bitpos &= (base_alignment - 1);
794 if (base_bitpos != 0)
795 base_alignment = base_bitpos & -base_bitpos;
796 /* Also look at the alignment of the base address DR analysis
797 computed. */
798 unsigned int base_addr_alignment = get_pointer_alignment (base_addr);
799 if (base_addr_alignment > base_alignment)
800 base_alignment = base_addr_alignment;
802 if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
803 DR_VECT_AUX (dr)->base_element_aligned = true;
805 alignment = TYPE_ALIGN_UNIT (vectype);
807 if ((compare_tree_int (aligned_to, alignment) < 0)
808 || !misalign)
810 if (dump_enabled_p ())
812 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
813 "Unknown alignment for access: ");
814 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
815 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
817 return true;
820 if (base_alignment < TYPE_ALIGN (vectype))
822 base = base_addr;
823 if (TREE_CODE (base) == ADDR_EXPR)
824 base = TREE_OPERAND (base, 0);
825 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
827 if (dump_enabled_p ())
829 dump_printf_loc (MSG_NOTE, vect_location,
830 "can't force alignment of ref: ");
831 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
832 dump_printf (MSG_NOTE, "\n");
834 return true;
837 if (DECL_USER_ALIGN (base))
839 if (dump_enabled_p ())
841 dump_printf_loc (MSG_NOTE, vect_location,
842 "not forcing alignment of user-aligned "
843 "variable: ");
844 dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
845 dump_printf (MSG_NOTE, "\n");
847 return true;
850 /* Force the alignment of the decl.
851 NOTE: This is the only change to the code we make during
852 the analysis phase, before deciding to vectorize the loop. */
853 if (dump_enabled_p ())
855 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
856 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
857 dump_printf (MSG_NOTE, "\n");
860 DR_VECT_AUX (dr)->base_decl = base;
861 DR_VECT_AUX (dr)->base_misaligned = true;
862 DR_VECT_AUX (dr)->base_element_aligned = true;
865 if (loop && nested_in_vect_loop_p (loop, stmt))
866 step = STMT_VINFO_DR_STEP (stmt_info);
867 else
868 step = DR_STEP (dr);
869 /* If this is a backward running DR then first access in the larger
870 vectype actually is N-1 elements before the address in the DR.
871 Adjust misalign accordingly. */
872 if (tree_int_cst_sgn (step) < 0)
874 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
875 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
876 otherwise we wouldn't be here. */
877 offset = fold_build2 (MULT_EXPR, ssizetype, offset, step);
878 /* PLUS because STEP was negative. */
879 misalign = size_binop (PLUS_EXPR, misalign, offset);
882 SET_DR_MISALIGNMENT (dr,
883 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
885 if (dump_enabled_p ())
887 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
888 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
889 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
890 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
893 return true;
897 /* Function vect_update_misalignment_for_peel
899 DR - the data reference whose misalignment is to be adjusted.
900 DR_PEEL - the data reference whose misalignment is being made
901 zero in the vector loop by the peel.
902 NPEEL - the number of iterations in the peel loop if the misalignment
903 of DR_PEEL is known at compile time. */
905 static void
906 vect_update_misalignment_for_peel (struct data_reference *dr,
907 struct data_reference *dr_peel, int npeel)
909 unsigned int i;
910 vec<dr_p> same_align_drs;
911 struct data_reference *current_dr;
912 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
913 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
914 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
915 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
917 /* For interleaved data accesses the step in the loop must be multiplied by
918 the size of the interleaving group. */
919 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
920 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
921 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
922 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
924 /* It can be assumed that the data refs with the same alignment as dr_peel
925 are aligned in the vector loop. */
926 same_align_drs
927 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
928 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
930 if (current_dr != dr)
931 continue;
932 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
933 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
934 SET_DR_MISALIGNMENT (dr, 0);
935 return;
938 if (known_alignment_for_access_p (dr)
939 && known_alignment_for_access_p (dr_peel))
941 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
942 int misal = DR_MISALIGNMENT (dr);
943 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
944 misal += negative ? -npeel * dr_size : npeel * dr_size;
945 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
946 SET_DR_MISALIGNMENT (dr, misal);
947 return;
950 if (dump_enabled_p ())
951 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
952 SET_DR_MISALIGNMENT (dr, -1);
956 /* Function verify_data_ref_alignment
958 Return TRUE if DR can be handled with respect to alignment. */
960 static bool
961 verify_data_ref_alignment (data_reference_p dr)
963 enum dr_alignment_support supportable_dr_alignment
964 = vect_supportable_dr_alignment (dr, false);
965 if (!supportable_dr_alignment)
967 if (dump_enabled_p ())
969 if (DR_IS_READ (dr))
970 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
971 "not vectorized: unsupported unaligned load.");
972 else
973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
974 "not vectorized: unsupported unaligned "
975 "store.");
977 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
978 DR_REF (dr));
979 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
981 return false;
984 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
985 dump_printf_loc (MSG_NOTE, vect_location,
986 "Vectorizing an unaligned access.\n");
988 return true;
991 /* Function vect_verify_datarefs_alignment
993 Return TRUE if all data references in the loop can be
994 handled with respect to alignment. */
996 bool
997 vect_verify_datarefs_alignment (loop_vec_info vinfo)
999 vec<data_reference_p> datarefs = vinfo->datarefs;
1000 struct data_reference *dr;
1001 unsigned int i;
1003 FOR_EACH_VEC_ELT (datarefs, i, dr)
1005 gimple *stmt = DR_STMT (dr);
1006 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1008 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1009 continue;
1011 /* For interleaving, only the alignment of the first access matters. */
1012 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1013 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1014 continue;
1016 /* Strided accesses perform only component accesses, alignment is
1017 irrelevant for them. */
1018 if (STMT_VINFO_STRIDED_P (stmt_info)
1019 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1020 continue;
1022 if (! verify_data_ref_alignment (dr))
1023 return false;
1026 return true;
1029 /* Given an memory reference EXP return whether its alignment is less
1030 than its size. */
1032 static bool
1033 not_size_aligned (tree exp)
1035 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1036 return true;
1038 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1039 > get_object_alignment (exp));
1042 /* Function vector_alignment_reachable_p
1044 Return true if vector alignment for DR is reachable by peeling
1045 a few loop iterations. Return false otherwise. */
1047 static bool
1048 vector_alignment_reachable_p (struct data_reference *dr)
1050 gimple *stmt = DR_STMT (dr);
1051 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1052 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1054 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1056 /* For interleaved access we peel only if number of iterations in
1057 the prolog loop ({VF - misalignment}), is a multiple of the
1058 number of the interleaved accesses. */
1059 int elem_size, mis_in_elements;
1060 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1062 /* FORNOW: handle only known alignment. */
1063 if (!known_alignment_for_access_p (dr))
1064 return false;
1066 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1067 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1069 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1070 return false;
1073 /* If misalignment is known at the compile time then allow peeling
1074 only if natural alignment is reachable through peeling. */
1075 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1077 HOST_WIDE_INT elmsize =
1078 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1079 if (dump_enabled_p ())
1081 dump_printf_loc (MSG_NOTE, vect_location,
1082 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1083 dump_printf (MSG_NOTE,
1084 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1086 if (DR_MISALIGNMENT (dr) % elmsize)
1088 if (dump_enabled_p ())
1089 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1090 "data size does not divide the misalignment.\n");
1091 return false;
1095 if (!known_alignment_for_access_p (dr))
1097 tree type = TREE_TYPE (DR_REF (dr));
1098 bool is_packed = not_size_aligned (DR_REF (dr));
1099 if (dump_enabled_p ())
1100 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1101 "Unknown misalignment, is_packed = %d\n",is_packed);
1102 if ((TYPE_USER_ALIGN (type) && !is_packed)
1103 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1104 return true;
1105 else
1106 return false;
1109 return true;
1113 /* Calculate the cost of the memory access represented by DR. */
1115 static void
1116 vect_get_data_access_cost (struct data_reference *dr,
1117 unsigned int *inside_cost,
1118 unsigned int *outside_cost,
1119 stmt_vector_for_cost *body_cost_vec)
1121 gimple *stmt = DR_STMT (dr);
1122 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1123 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1124 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1125 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1126 int ncopies = vf / nunits;
1128 if (DR_IS_READ (dr))
1129 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1130 NULL, body_cost_vec, false);
1131 else
1132 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1134 if (dump_enabled_p ())
1135 dump_printf_loc (MSG_NOTE, vect_location,
1136 "vect_get_data_access_cost: inside_cost = %d, "
1137 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1141 typedef struct _vect_peel_info
1143 int npeel;
1144 struct data_reference *dr;
1145 unsigned int count;
1146 } *vect_peel_info;
1148 typedef struct _vect_peel_extended_info
1150 struct _vect_peel_info peel_info;
1151 unsigned int inside_cost;
1152 unsigned int outside_cost;
1153 stmt_vector_for_cost body_cost_vec;
1154 } *vect_peel_extended_info;
1157 /* Peeling hashtable helpers. */
1159 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1161 static inline hashval_t hash (const _vect_peel_info *);
1162 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1165 inline hashval_t
1166 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1168 return (hashval_t) peel_info->npeel;
1171 inline bool
1172 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1174 return (a->npeel == b->npeel);
1178 /* Insert DR into peeling hash table with NPEEL as key. */
1180 static void
1181 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1182 loop_vec_info loop_vinfo, struct data_reference *dr,
1183 int npeel)
1185 struct _vect_peel_info elem, *slot;
1186 _vect_peel_info **new_slot;
1187 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1189 elem.npeel = npeel;
1190 slot = peeling_htab->find (&elem);
1191 if (slot)
1192 slot->count++;
1193 else
1195 slot = XNEW (struct _vect_peel_info);
1196 slot->npeel = npeel;
1197 slot->dr = dr;
1198 slot->count = 1;
1199 new_slot = peeling_htab->find_slot (slot, INSERT);
1200 *new_slot = slot;
1203 if (!supportable_dr_alignment
1204 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1205 slot->count += VECT_MAX_COST;
1209 /* Traverse peeling hash table to find peeling option that aligns maximum
1210 number of data accesses. */
1213 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1214 _vect_peel_extended_info *max)
1216 vect_peel_info elem = *slot;
1218 if (elem->count > max->peel_info.count
1219 || (elem->count == max->peel_info.count
1220 && max->peel_info.npeel > elem->npeel))
1222 max->peel_info.npeel = elem->npeel;
1223 max->peel_info.count = elem->count;
1224 max->peel_info.dr = elem->dr;
1227 return 1;
1231 /* Traverse peeling hash table and calculate cost for each peeling option.
1232 Find the one with the lowest cost. */
1235 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1236 _vect_peel_extended_info *min)
1238 vect_peel_info elem = *slot;
1239 int save_misalignment, dummy;
1240 unsigned int inside_cost = 0, outside_cost = 0, i;
1241 gimple *stmt = DR_STMT (elem->dr);
1242 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1243 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1244 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1245 struct data_reference *dr;
1246 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1248 prologue_cost_vec.create (2);
1249 body_cost_vec.create (2);
1250 epilogue_cost_vec.create (2);
1252 FOR_EACH_VEC_ELT (datarefs, i, dr)
1254 stmt = DR_STMT (dr);
1255 stmt_info = vinfo_for_stmt (stmt);
1256 /* For interleaving, only the alignment of the first access
1257 matters. */
1258 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1259 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1260 continue;
1262 /* Strided accesses perform only component accesses, alignment is
1263 irrelevant for them. */
1264 if (STMT_VINFO_STRIDED_P (stmt_info)
1265 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1266 continue;
1268 save_misalignment = DR_MISALIGNMENT (dr);
1269 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1270 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1271 &body_cost_vec);
1272 SET_DR_MISALIGNMENT (dr, save_misalignment);
1275 outside_cost += vect_get_known_peeling_cost
1276 (loop_vinfo, elem->npeel, &dummy,
1277 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1278 &prologue_cost_vec, &epilogue_cost_vec);
1280 /* Prologue and epilogue costs are added to the target model later.
1281 These costs depend only on the scalar iteration cost, the
1282 number of peeling iterations finally chosen, and the number of
1283 misaligned statements. So discard the information found here. */
1284 prologue_cost_vec.release ();
1285 epilogue_cost_vec.release ();
1287 if (inside_cost < min->inside_cost
1288 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1290 min->inside_cost = inside_cost;
1291 min->outside_cost = outside_cost;
1292 min->body_cost_vec.release ();
1293 min->body_cost_vec = body_cost_vec;
1294 min->peel_info.dr = elem->dr;
1295 min->peel_info.npeel = elem->npeel;
1297 else
1298 body_cost_vec.release ();
1300 return 1;
1304 /* Choose best peeling option by traversing peeling hash table and either
1305 choosing an option with the lowest cost (if cost model is enabled) or the
1306 option that aligns as many accesses as possible. */
1308 static struct data_reference *
1309 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1310 loop_vec_info loop_vinfo,
1311 unsigned int *npeel,
1312 stmt_vector_for_cost *body_cost_vec)
1314 struct _vect_peel_extended_info res;
1316 res.peel_info.dr = NULL;
1317 res.body_cost_vec = stmt_vector_for_cost ();
1319 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1321 res.inside_cost = INT_MAX;
1322 res.outside_cost = INT_MAX;
1323 peeling_htab->traverse <_vect_peel_extended_info *,
1324 vect_peeling_hash_get_lowest_cost> (&res);
1326 else
1328 res.peel_info.count = 0;
1329 peeling_htab->traverse <_vect_peel_extended_info *,
1330 vect_peeling_hash_get_most_frequent> (&res);
1333 *npeel = res.peel_info.npeel;
1334 *body_cost_vec = res.body_cost_vec;
1335 return res.peel_info.dr;
1339 /* Function vect_enhance_data_refs_alignment
1341 This pass will use loop versioning and loop peeling in order to enhance
1342 the alignment of data references in the loop.
1344 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1345 original loop is to be vectorized. Any other loops that are created by
1346 the transformations performed in this pass - are not supposed to be
1347 vectorized. This restriction will be relaxed.
1349 This pass will require a cost model to guide it whether to apply peeling
1350 or versioning or a combination of the two. For example, the scheme that
1351 intel uses when given a loop with several memory accesses, is as follows:
1352 choose one memory access ('p') which alignment you want to force by doing
1353 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1354 other accesses are not necessarily aligned, or (2) use loop versioning to
1355 generate one loop in which all accesses are aligned, and another loop in
1356 which only 'p' is necessarily aligned.
1358 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1359 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1360 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1362 Devising a cost model is the most critical aspect of this work. It will
1363 guide us on which access to peel for, whether to use loop versioning, how
1364 many versions to create, etc. The cost model will probably consist of
1365 generic considerations as well as target specific considerations (on
1366 powerpc for example, misaligned stores are more painful than misaligned
1367 loads).
1369 Here are the general steps involved in alignment enhancements:
1371 -- original loop, before alignment analysis:
1372 for (i=0; i<N; i++){
1373 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1374 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1377 -- After vect_compute_data_refs_alignment:
1378 for (i=0; i<N; i++){
1379 x = q[i]; # DR_MISALIGNMENT(q) = 3
1380 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1383 -- Possibility 1: we do loop versioning:
1384 if (p is aligned) {
1385 for (i=0; i<N; i++){ # loop 1A
1386 x = q[i]; # DR_MISALIGNMENT(q) = 3
1387 p[i] = y; # DR_MISALIGNMENT(p) = 0
1390 else {
1391 for (i=0; i<N; i++){ # loop 1B
1392 x = q[i]; # DR_MISALIGNMENT(q) = 3
1393 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1397 -- Possibility 2: we do loop peeling:
1398 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1399 x = q[i];
1400 p[i] = y;
1402 for (i = 3; i < N; i++){ # loop 2A
1403 x = q[i]; # DR_MISALIGNMENT(q) = 0
1404 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1407 -- Possibility 3: combination of loop peeling and versioning:
1408 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1409 x = q[i];
1410 p[i] = y;
1412 if (p is aligned) {
1413 for (i = 3; i<N; i++){ # loop 3A
1414 x = q[i]; # DR_MISALIGNMENT(q) = 0
1415 p[i] = y; # DR_MISALIGNMENT(p) = 0
1418 else {
1419 for (i = 3; i<N; i++){ # loop 3B
1420 x = q[i]; # DR_MISALIGNMENT(q) = 0
1421 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1425 These loops are later passed to loop_transform to be vectorized. The
1426 vectorizer will use the alignment information to guide the transformation
1427 (whether to generate regular loads/stores, or with special handling for
1428 misalignment). */
1430 bool
1431 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1433 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1434 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1435 enum dr_alignment_support supportable_dr_alignment;
1436 struct data_reference *dr0 = NULL, *first_store = NULL;
1437 struct data_reference *dr;
1438 unsigned int i, j;
1439 bool do_peeling = false;
1440 bool do_versioning = false;
1441 bool stat;
1442 gimple *stmt;
1443 stmt_vec_info stmt_info;
1444 unsigned int npeel = 0;
1445 bool all_misalignments_unknown = true;
1446 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1447 unsigned possible_npeel_number = 1;
1448 tree vectype;
1449 unsigned int nelements, mis, same_align_drs_max = 0;
1450 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1451 hash_table<peel_info_hasher> peeling_htab (1);
1453 if (dump_enabled_p ())
1454 dump_printf_loc (MSG_NOTE, vect_location,
1455 "=== vect_enhance_data_refs_alignment ===\n");
1457 /* Reset data so we can safely be called multiple times. */
1458 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1459 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1461 /* While cost model enhancements are expected in the future, the high level
1462 view of the code at this time is as follows:
1464 A) If there is a misaligned access then see if peeling to align
1465 this access can make all data references satisfy
1466 vect_supportable_dr_alignment. If so, update data structures
1467 as needed and return true.
1469 B) If peeling wasn't possible and there is a data reference with an
1470 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1471 then see if loop versioning checks can be used to make all data
1472 references satisfy vect_supportable_dr_alignment. If so, update
1473 data structures as needed and return true.
1475 C) If neither peeling nor versioning were successful then return false if
1476 any data reference does not satisfy vect_supportable_dr_alignment.
1478 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1480 Note, Possibility 3 above (which is peeling and versioning together) is not
1481 being done at this time. */
1483 /* (1) Peeling to force alignment. */
1485 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1486 Considerations:
1487 + How many accesses will become aligned due to the peeling
1488 - How many accesses will become unaligned due to the peeling,
1489 and the cost of misaligned accesses.
1490 - The cost of peeling (the extra runtime checks, the increase
1491 in code size). */
1493 FOR_EACH_VEC_ELT (datarefs, i, dr)
1495 stmt = DR_STMT (dr);
1496 stmt_info = vinfo_for_stmt (stmt);
1498 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1499 continue;
1501 /* For interleaving, only the alignment of the first access
1502 matters. */
1503 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1504 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1505 continue;
1507 /* For invariant accesses there is nothing to enhance. */
1508 if (integer_zerop (DR_STEP (dr)))
1509 continue;
1511 /* Strided accesses perform only component accesses, alignment is
1512 irrelevant for them. */
1513 if (STMT_VINFO_STRIDED_P (stmt_info)
1514 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1515 continue;
1517 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1518 do_peeling = vector_alignment_reachable_p (dr);
1519 if (do_peeling)
1521 if (known_alignment_for_access_p (dr))
1523 unsigned int npeel_tmp;
1524 bool negative = tree_int_cst_compare (DR_STEP (dr),
1525 size_zero_node) < 0;
1527 /* Save info about DR in the hash table. */
1528 vectype = STMT_VINFO_VECTYPE (stmt_info);
1529 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1530 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1531 TREE_TYPE (DR_REF (dr))));
1532 npeel_tmp = (negative
1533 ? (mis - nelements) : (nelements - mis))
1534 & (nelements - 1);
1536 /* For multiple types, it is possible that the bigger type access
1537 will have more than one peeling option. E.g., a loop with two
1538 types: one of size (vector size / 4), and the other one of
1539 size (vector size / 8). Vectorization factor will 8. If both
1540 access are misaligned by 3, the first one needs one scalar
1541 iteration to be aligned, and the second one needs 5. But the
1542 first one will be aligned also by peeling 5 scalar
1543 iterations, and in that case both accesses will be aligned.
1544 Hence, except for the immediate peeling amount, we also want
1545 to try to add full vector size, while we don't exceed
1546 vectorization factor.
1547 We do this automtically for cost model, since we calculate cost
1548 for every peeling option. */
1549 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1551 if (STMT_SLP_TYPE (stmt_info))
1552 possible_npeel_number
1553 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1554 else
1555 possible_npeel_number = vf / nelements;
1558 /* Handle the aligned case. We may decide to align some other
1559 access, making DR unaligned. */
1560 if (DR_MISALIGNMENT (dr) == 0)
1562 npeel_tmp = 0;
1563 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1564 possible_npeel_number++;
1567 for (j = 0; j < possible_npeel_number; j++)
1569 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1570 dr, npeel_tmp);
1571 npeel_tmp += nelements;
1574 all_misalignments_unknown = false;
1575 /* Data-ref that was chosen for the case that all the
1576 misalignments are unknown is not relevant anymore, since we
1577 have a data-ref with known alignment. */
1578 dr0 = NULL;
1580 else
1582 /* If we don't know any misalignment values, we prefer
1583 peeling for data-ref that has the maximum number of data-refs
1584 with the same alignment, unless the target prefers to align
1585 stores over load. */
1586 if (all_misalignments_unknown)
1588 unsigned same_align_drs
1589 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1590 if (!dr0
1591 || same_align_drs_max < same_align_drs)
1593 same_align_drs_max = same_align_drs;
1594 dr0 = dr;
1596 /* For data-refs with the same number of related
1597 accesses prefer the one where the misalign
1598 computation will be invariant in the outermost loop. */
1599 else if (same_align_drs_max == same_align_drs)
1601 struct loop *ivloop0, *ivloop;
1602 ivloop0 = outermost_invariant_loop_for_expr
1603 (loop, DR_BASE_ADDRESS (dr0));
1604 ivloop = outermost_invariant_loop_for_expr
1605 (loop, DR_BASE_ADDRESS (dr));
1606 if ((ivloop && !ivloop0)
1607 || (ivloop && ivloop0
1608 && flow_loop_nested_p (ivloop, ivloop0)))
1609 dr0 = dr;
1612 if (!first_store && DR_IS_WRITE (dr))
1613 first_store = dr;
1616 /* If there are both known and unknown misaligned accesses in the
1617 loop, we choose peeling amount according to the known
1618 accesses. */
1619 if (!supportable_dr_alignment)
1621 dr0 = dr;
1622 if (!first_store && DR_IS_WRITE (dr))
1623 first_store = dr;
1627 else
1629 if (!aligned_access_p (dr))
1631 if (dump_enabled_p ())
1632 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1633 "vector alignment may not be reachable\n");
1634 break;
1639 /* Check if we can possibly peel the loop. */
1640 if (!vect_can_advance_ivs_p (loop_vinfo)
1641 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1642 || loop->inner)
1643 do_peeling = false;
1645 if (do_peeling
1646 && all_misalignments_unknown
1647 && vect_supportable_dr_alignment (dr0, false))
1649 /* Check if the target requires to prefer stores over loads, i.e., if
1650 misaligned stores are more expensive than misaligned loads (taking
1651 drs with same alignment into account). */
1652 if (first_store && DR_IS_READ (dr0))
1654 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1655 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1656 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1657 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1658 stmt_vector_for_cost dummy;
1659 dummy.create (2);
1661 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1662 &dummy);
1663 vect_get_data_access_cost (first_store, &store_inside_cost,
1664 &store_outside_cost, &dummy);
1666 dummy.release ();
1668 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1669 aligning the load DR0). */
1670 load_inside_penalty = store_inside_cost;
1671 load_outside_penalty = store_outside_cost;
1672 for (i = 0;
1673 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1674 DR_STMT (first_store))).iterate (i, &dr);
1675 i++)
1676 if (DR_IS_READ (dr))
1678 load_inside_penalty += load_inside_cost;
1679 load_outside_penalty += load_outside_cost;
1681 else
1683 load_inside_penalty += store_inside_cost;
1684 load_outside_penalty += store_outside_cost;
1687 /* Calculate the penalty for leaving DR0 unaligned (by
1688 aligning the FIRST_STORE). */
1689 store_inside_penalty = load_inside_cost;
1690 store_outside_penalty = load_outside_cost;
1691 for (i = 0;
1692 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1693 DR_STMT (dr0))).iterate (i, &dr);
1694 i++)
1695 if (DR_IS_READ (dr))
1697 store_inside_penalty += load_inside_cost;
1698 store_outside_penalty += load_outside_cost;
1700 else
1702 store_inside_penalty += store_inside_cost;
1703 store_outside_penalty += store_outside_cost;
1706 if (load_inside_penalty > store_inside_penalty
1707 || (load_inside_penalty == store_inside_penalty
1708 && load_outside_penalty > store_outside_penalty))
1709 dr0 = first_store;
1712 /* In case there are only loads with different unknown misalignments, use
1713 peeling only if it may help to align other accesses in the loop or
1714 if it may help improving load bandwith when we'd end up using
1715 unaligned loads. */
1716 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1717 if (!first_store
1718 && !STMT_VINFO_SAME_ALIGN_REFS (
1719 vinfo_for_stmt (DR_STMT (dr0))).length ()
1720 && (vect_supportable_dr_alignment (dr0, false)
1721 != dr_unaligned_supported
1722 || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1723 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1724 do_peeling = false;
1727 if (do_peeling && !dr0)
1729 /* Peeling is possible, but there is no data access that is not supported
1730 unless aligned. So we try to choose the best possible peeling. */
1732 /* We should get here only if there are drs with known misalignment. */
1733 gcc_assert (!all_misalignments_unknown);
1735 /* Choose the best peeling from the hash table. */
1736 dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
1737 loop_vinfo, &npeel,
1738 &body_cost_vec);
1739 if (!dr0 || !npeel)
1740 do_peeling = false;
1743 if (do_peeling)
1745 stmt = DR_STMT (dr0);
1746 stmt_info = vinfo_for_stmt (stmt);
1747 vectype = STMT_VINFO_VECTYPE (stmt_info);
1748 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1750 if (known_alignment_for_access_p (dr0))
1752 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1753 size_zero_node) < 0;
1754 if (!npeel)
1756 /* Since it's known at compile time, compute the number of
1757 iterations in the peeled loop (the peeling factor) for use in
1758 updating DR_MISALIGNMENT values. The peeling factor is the
1759 vectorization factor minus the misalignment as an element
1760 count. */
1761 mis = DR_MISALIGNMENT (dr0);
1762 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1763 npeel = ((negative ? mis - nelements : nelements - mis)
1764 & (nelements - 1));
1767 /* For interleaved data access every iteration accesses all the
1768 members of the group, therefore we divide the number of iterations
1769 by the group size. */
1770 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1771 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1772 npeel /= GROUP_SIZE (stmt_info);
1774 if (dump_enabled_p ())
1775 dump_printf_loc (MSG_NOTE, vect_location,
1776 "Try peeling by %d\n", npeel);
1779 /* Ensure that all data refs can be vectorized after the peel. */
1780 FOR_EACH_VEC_ELT (datarefs, i, dr)
1782 int save_misalignment;
1784 if (dr == dr0)
1785 continue;
1787 stmt = DR_STMT (dr);
1788 stmt_info = vinfo_for_stmt (stmt);
1789 /* For interleaving, only the alignment of the first access
1790 matters. */
1791 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1792 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1793 continue;
1795 /* Strided accesses perform only component accesses, alignment is
1796 irrelevant for them. */
1797 if (STMT_VINFO_STRIDED_P (stmt_info)
1798 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1799 continue;
1801 save_misalignment = DR_MISALIGNMENT (dr);
1802 vect_update_misalignment_for_peel (dr, dr0, npeel);
1803 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1804 SET_DR_MISALIGNMENT (dr, save_misalignment);
1806 if (!supportable_dr_alignment)
1808 do_peeling = false;
1809 break;
1813 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1815 stat = vect_verify_datarefs_alignment (loop_vinfo);
1816 if (!stat)
1817 do_peeling = false;
1818 else
1820 body_cost_vec.release ();
1821 return stat;
1825 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1826 if (do_peeling)
1828 unsigned max_allowed_peel
1829 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1830 if (max_allowed_peel != (unsigned)-1)
1832 unsigned max_peel = npeel;
1833 if (max_peel == 0)
1835 gimple *dr_stmt = DR_STMT (dr0);
1836 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1837 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1838 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1840 if (max_peel > max_allowed_peel)
1842 do_peeling = false;
1843 if (dump_enabled_p ())
1844 dump_printf_loc (MSG_NOTE, vect_location,
1845 "Disable peeling, max peels reached: %d\n", max_peel);
1850 /* Cost model #2 - if peeling may result in a remaining loop not
1851 iterating enough to be vectorized then do not peel. */
1852 if (do_peeling
1853 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1855 unsigned max_peel
1856 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1857 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1858 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1859 do_peeling = false;
1862 if (do_peeling)
1864 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1865 If the misalignment of DR_i is identical to that of dr0 then set
1866 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1867 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1868 by the peeling factor times the element size of DR_i (MOD the
1869 vectorization factor times the size). Otherwise, the
1870 misalignment of DR_i must be set to unknown. */
1871 FOR_EACH_VEC_ELT (datarefs, i, dr)
1872 if (dr != dr0)
1874 /* Strided accesses perform only component accesses, alignment
1875 is irrelevant for them. */
1876 stmt_info = vinfo_for_stmt (DR_STMT (dr));
1877 if (STMT_VINFO_STRIDED_P (stmt_info)
1878 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1879 continue;
1881 vect_update_misalignment_for_peel (dr, dr0, npeel);
1884 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1885 if (npeel)
1886 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1887 else
1888 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1889 = DR_MISALIGNMENT (dr0);
1890 SET_DR_MISALIGNMENT (dr0, 0);
1891 if (dump_enabled_p ())
1893 dump_printf_loc (MSG_NOTE, vect_location,
1894 "Alignment of access forced using peeling.\n");
1895 dump_printf_loc (MSG_NOTE, vect_location,
1896 "Peeling for alignment will be applied.\n");
1898 /* The inside-loop cost will be accounted for in vectorizable_load
1899 and vectorizable_store correctly with adjusted alignments.
1900 Drop the body_cst_vec on the floor here. */
1901 body_cost_vec.release ();
1903 stat = vect_verify_datarefs_alignment (loop_vinfo);
1904 gcc_assert (stat);
1905 return stat;
1909 body_cost_vec.release ();
1911 /* (2) Versioning to force alignment. */
1913 /* Try versioning if:
1914 1) optimize loop for speed
1915 2) there is at least one unsupported misaligned data ref with an unknown
1916 misalignment, and
1917 3) all misaligned data refs with a known misalignment are supported, and
1918 4) the number of runtime alignment checks is within reason. */
1920 do_versioning =
1921 optimize_loop_nest_for_speed_p (loop)
1922 && (!loop->inner); /* FORNOW */
1924 if (do_versioning)
1926 FOR_EACH_VEC_ELT (datarefs, i, dr)
1928 stmt = DR_STMT (dr);
1929 stmt_info = vinfo_for_stmt (stmt);
1931 /* For interleaving, only the alignment of the first access
1932 matters. */
1933 if (aligned_access_p (dr)
1934 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1935 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1936 continue;
1938 if (STMT_VINFO_STRIDED_P (stmt_info))
1940 /* Strided loads perform only component accesses, alignment is
1941 irrelevant for them. */
1942 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1943 continue;
1944 do_versioning = false;
1945 break;
1948 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1950 if (!supportable_dr_alignment)
1952 gimple *stmt;
1953 int mask;
1954 tree vectype;
1956 if (known_alignment_for_access_p (dr)
1957 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1958 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1960 do_versioning = false;
1961 break;
1964 stmt = DR_STMT (dr);
1965 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1966 gcc_assert (vectype);
1968 /* The rightmost bits of an aligned address must be zeros.
1969 Construct the mask needed for this test. For example,
1970 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1971 mask must be 15 = 0xf. */
1972 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1974 /* FORNOW: use the same mask to test all potentially unaligned
1975 references in the loop. The vectorizer currently supports
1976 a single vector size, see the reference to
1977 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1978 vectorization factor is computed. */
1979 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1980 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1981 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1982 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1983 DR_STMT (dr));
1987 /* Versioning requires at least one misaligned data reference. */
1988 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1989 do_versioning = false;
1990 else if (!do_versioning)
1991 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1994 if (do_versioning)
1996 vec<gimple *> may_misalign_stmts
1997 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1998 gimple *stmt;
2000 /* It can now be assumed that the data references in the statements
2001 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2002 of the loop being vectorized. */
2003 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2005 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2006 dr = STMT_VINFO_DATA_REF (stmt_info);
2007 SET_DR_MISALIGNMENT (dr, 0);
2008 if (dump_enabled_p ())
2009 dump_printf_loc (MSG_NOTE, vect_location,
2010 "Alignment of access forced using versioning.\n");
2013 if (dump_enabled_p ())
2014 dump_printf_loc (MSG_NOTE, vect_location,
2015 "Versioning for alignment will be applied.\n");
2017 /* Peeling and versioning can't be done together at this time. */
2018 gcc_assert (! (do_peeling && do_versioning));
2020 stat = vect_verify_datarefs_alignment (loop_vinfo);
2021 gcc_assert (stat);
2022 return stat;
2025 /* This point is reached if neither peeling nor versioning is being done. */
2026 gcc_assert (! (do_peeling || do_versioning));
2028 stat = vect_verify_datarefs_alignment (loop_vinfo);
2029 return stat;
2033 /* Function vect_find_same_alignment_drs.
2035 Update group and alignment relations according to the chosen
2036 vectorization factor. */
2038 static void
2039 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
2040 loop_vec_info loop_vinfo)
2042 unsigned int i;
2043 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2044 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2045 struct data_reference *dra = DDR_A (ddr);
2046 struct data_reference *drb = DDR_B (ddr);
2047 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2048 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2049 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
2050 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
2051 lambda_vector dist_v;
2052 unsigned int loop_depth;
2054 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2055 return;
2057 if (dra == drb)
2058 return;
2060 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
2061 return;
2063 /* Loop-based vectorization and known data dependence. */
2064 if (DDR_NUM_DIST_VECTS (ddr) == 0)
2065 return;
2067 /* Data-dependence analysis reports a distance vector of zero
2068 for data-references that overlap only in the first iteration
2069 but have different sign step (see PR45764).
2070 So as a sanity check require equal DR_STEP. */
2071 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2072 return;
2074 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
2075 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
2077 int dist = dist_v[loop_depth];
2079 if (dump_enabled_p ())
2080 dump_printf_loc (MSG_NOTE, vect_location,
2081 "dependence distance = %d.\n", dist);
2083 /* Same loop iteration. */
2084 if (dist == 0
2085 || (dist % vectorization_factor == 0 && dra_size == drb_size))
2087 /* Two references with distance zero have the same alignment. */
2088 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2089 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2090 if (dump_enabled_p ())
2092 dump_printf_loc (MSG_NOTE, vect_location,
2093 "accesses have the same alignment.\n");
2094 dump_printf (MSG_NOTE,
2095 "dependence distance modulo vf == 0 between ");
2096 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2097 dump_printf (MSG_NOTE, " and ");
2098 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2099 dump_printf (MSG_NOTE, "\n");
2106 /* Function vect_analyze_data_refs_alignment
2108 Analyze the alignment of the data-references in the loop.
2109 Return FALSE if a data reference is found that cannot be vectorized. */
2111 bool
2112 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2114 if (dump_enabled_p ())
2115 dump_printf_loc (MSG_NOTE, vect_location,
2116 "=== vect_analyze_data_refs_alignment ===\n");
2118 /* Mark groups of data references with same alignment using
2119 data dependence information. */
2120 vec<ddr_p> ddrs = vinfo->ddrs;
2121 struct data_dependence_relation *ddr;
2122 unsigned int i;
2124 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2125 vect_find_same_alignment_drs (ddr, vinfo);
2127 vec<data_reference_p> datarefs = vinfo->datarefs;
2128 struct data_reference *dr;
2130 FOR_EACH_VEC_ELT (datarefs, i, dr)
2132 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2133 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2134 && !vect_compute_data_ref_alignment (dr))
2136 /* Strided accesses perform only component accesses, misalignment
2137 information is irrelevant for them. */
2138 if (STMT_VINFO_STRIDED_P (stmt_info)
2139 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2140 continue;
2142 if (dump_enabled_p ())
2143 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2144 "not vectorized: can't calculate alignment "
2145 "for data ref.\n");
2147 return false;
2151 return true;
2155 /* Analyze alignment of DRs of stmts in NODE. */
2157 static bool
2158 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2160 /* We vectorize from the first scalar stmt in the node unless
2161 the node is permuted in which case we start from the first
2162 element in the group. */
2163 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2164 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2165 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2166 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2168 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2169 if (! vect_compute_data_ref_alignment (dr)
2170 /* For creating the data-ref pointer we need alignment of the
2171 first element anyway. */
2172 || (dr != first_dr
2173 && ! vect_compute_data_ref_alignment (first_dr))
2174 || ! verify_data_ref_alignment (dr))
2176 if (dump_enabled_p ())
2177 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2178 "not vectorized: bad data alignment in basic "
2179 "block.\n");
2180 return false;
2183 return true;
2186 /* Function vect_slp_analyze_instance_alignment
2188 Analyze the alignment of the data-references in the SLP instance.
2189 Return FALSE if a data reference is found that cannot be vectorized. */
2191 bool
2192 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2194 if (dump_enabled_p ())
2195 dump_printf_loc (MSG_NOTE, vect_location,
2196 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2198 slp_tree node;
2199 unsigned i;
2200 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2201 if (! vect_slp_analyze_and_verify_node_alignment (node))
2202 return false;
2204 node = SLP_INSTANCE_TREE (instance);
2205 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2206 && ! vect_slp_analyze_and_verify_node_alignment
2207 (SLP_INSTANCE_TREE (instance)))
2208 return false;
2210 return true;
2214 /* Analyze groups of accesses: check that DR belongs to a group of
2215 accesses of legal size, step, etc. Detect gaps, single element
2216 interleaving, and other special cases. Set grouped access info.
2217 Collect groups of strided stores for further use in SLP analysis.
2218 Worker for vect_analyze_group_access. */
2220 static bool
2221 vect_analyze_group_access_1 (struct data_reference *dr)
2223 tree step = DR_STEP (dr);
2224 tree scalar_type = TREE_TYPE (DR_REF (dr));
2225 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2226 gimple *stmt = DR_STMT (dr);
2227 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2228 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2229 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2230 HOST_WIDE_INT dr_step = -1;
2231 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2232 bool slp_impossible = false;
2234 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2235 size of the interleaving group (including gaps). */
2236 if (tree_fits_shwi_p (step))
2238 dr_step = tree_to_shwi (step);
2239 /* Check that STEP is a multiple of type size. Otherwise there is
2240 a non-element-sized gap at the end of the group which we
2241 cannot represent in GROUP_GAP or GROUP_SIZE.
2242 ??? As we can handle non-constant step fine here we should
2243 simply remove uses of GROUP_GAP between the last and first
2244 element and instead rely on DR_STEP. GROUP_SIZE then would
2245 simply not include that gap. */
2246 if ((dr_step % type_size) != 0)
2248 if (dump_enabled_p ())
2250 dump_printf_loc (MSG_NOTE, vect_location,
2251 "Step ");
2252 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2253 dump_printf (MSG_NOTE,
2254 " is not a multiple of the element size for ");
2255 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2256 dump_printf (MSG_NOTE, "\n");
2258 return false;
2260 groupsize = absu_hwi (dr_step) / type_size;
2262 else
2263 groupsize = 0;
2265 /* Not consecutive access is possible only if it is a part of interleaving. */
2266 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2268 /* Check if it this DR is a part of interleaving, and is a single
2269 element of the group that is accessed in the loop. */
2271 /* Gaps are supported only for loads. STEP must be a multiple of the type
2272 size. The size of the group must be a power of 2. */
2273 if (DR_IS_READ (dr)
2274 && (dr_step % type_size) == 0
2275 && groupsize > 0
2276 && pow2p_hwi (groupsize))
2278 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2279 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2280 GROUP_GAP (stmt_info) = groupsize - 1;
2281 if (dump_enabled_p ())
2283 dump_printf_loc (MSG_NOTE, vect_location,
2284 "Detected single element interleaving ");
2285 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2286 dump_printf (MSG_NOTE, " step ");
2287 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2288 dump_printf (MSG_NOTE, "\n");
2291 return true;
2294 if (dump_enabled_p ())
2296 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2297 "not consecutive access ");
2298 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2301 if (bb_vinfo)
2303 /* Mark the statement as unvectorizable. */
2304 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2305 return true;
2308 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2309 STMT_VINFO_STRIDED_P (stmt_info) = true;
2310 return true;
2313 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2315 /* First stmt in the interleaving chain. Check the chain. */
2316 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2317 struct data_reference *data_ref = dr;
2318 unsigned int count = 1;
2319 tree prev_init = DR_INIT (data_ref);
2320 gimple *prev = stmt;
2321 HOST_WIDE_INT diff, gaps = 0;
2323 while (next)
2325 /* Skip same data-refs. In case that two or more stmts share
2326 data-ref (supported only for loads), we vectorize only the first
2327 stmt, and the rest get their vectorized loads from the first
2328 one. */
2329 if (!tree_int_cst_compare (DR_INIT (data_ref),
2330 DR_INIT (STMT_VINFO_DATA_REF (
2331 vinfo_for_stmt (next)))))
2333 if (DR_IS_WRITE (data_ref))
2335 if (dump_enabled_p ())
2336 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2337 "Two store stmts share the same dr.\n");
2338 return false;
2341 if (dump_enabled_p ())
2342 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2343 "Two or more load stmts share the same dr.\n");
2345 /* For load use the same data-ref load. */
2346 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2348 prev = next;
2349 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2350 continue;
2353 prev = next;
2354 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2356 /* All group members have the same STEP by construction. */
2357 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2359 /* Check that the distance between two accesses is equal to the type
2360 size. Otherwise, we have gaps. */
2361 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2362 - TREE_INT_CST_LOW (prev_init)) / type_size;
2363 if (diff != 1)
2365 /* FORNOW: SLP of accesses with gaps is not supported. */
2366 slp_impossible = true;
2367 if (DR_IS_WRITE (data_ref))
2369 if (dump_enabled_p ())
2370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2371 "interleaved store with gaps\n");
2372 return false;
2375 gaps += diff - 1;
2378 last_accessed_element += diff;
2380 /* Store the gap from the previous member of the group. If there is no
2381 gap in the access, GROUP_GAP is always 1. */
2382 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2384 prev_init = DR_INIT (data_ref);
2385 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2386 /* Count the number of data-refs in the chain. */
2387 count++;
2390 if (groupsize == 0)
2391 groupsize = count + gaps;
2393 /* This could be UINT_MAX but as we are generating code in a very
2394 inefficient way we have to cap earlier. See PR78699 for example. */
2395 if (groupsize > 4096)
2397 if (dump_enabled_p ())
2398 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2399 "group is too large\n");
2400 return false;
2403 /* Check that the size of the interleaving is equal to count for stores,
2404 i.e., that there are no gaps. */
2405 if (groupsize != count
2406 && !DR_IS_READ (dr))
2408 if (dump_enabled_p ())
2409 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2410 "interleaved store with gaps\n");
2411 return false;
2414 /* If there is a gap after the last load in the group it is the
2415 difference between the groupsize and the last accessed
2416 element.
2417 When there is no gap, this difference should be 0. */
2418 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2420 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2421 if (dump_enabled_p ())
2423 dump_printf_loc (MSG_NOTE, vect_location,
2424 "Detected interleaving ");
2425 if (DR_IS_READ (dr))
2426 dump_printf (MSG_NOTE, "load ");
2427 else
2428 dump_printf (MSG_NOTE, "store ");
2429 dump_printf (MSG_NOTE, "of size %u starting with ",
2430 (unsigned)groupsize);
2431 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2432 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2433 dump_printf_loc (MSG_NOTE, vect_location,
2434 "There is a gap of %u elements after the group\n",
2435 GROUP_GAP (vinfo_for_stmt (stmt)));
2438 /* SLP: create an SLP data structure for every interleaving group of
2439 stores for further analysis in vect_analyse_slp. */
2440 if (DR_IS_WRITE (dr) && !slp_impossible)
2442 if (loop_vinfo)
2443 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2444 if (bb_vinfo)
2445 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2449 return true;
2452 /* Analyze groups of accesses: check that DR belongs to a group of
2453 accesses of legal size, step, etc. Detect gaps, single element
2454 interleaving, and other special cases. Set grouped access info.
2455 Collect groups of strided stores for further use in SLP analysis. */
2457 static bool
2458 vect_analyze_group_access (struct data_reference *dr)
2460 if (!vect_analyze_group_access_1 (dr))
2462 /* Dissolve the group if present. */
2463 gimple *next;
2464 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2465 while (stmt)
2467 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2468 next = GROUP_NEXT_ELEMENT (vinfo);
2469 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2470 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2471 stmt = next;
2473 return false;
2475 return true;
2478 /* Analyze the access pattern of the data-reference DR.
2479 In case of non-consecutive accesses call vect_analyze_group_access() to
2480 analyze groups of accesses. */
2482 static bool
2483 vect_analyze_data_ref_access (struct data_reference *dr)
2485 tree step = DR_STEP (dr);
2486 tree scalar_type = TREE_TYPE (DR_REF (dr));
2487 gimple *stmt = DR_STMT (dr);
2488 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2489 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2490 struct loop *loop = NULL;
2492 if (loop_vinfo)
2493 loop = LOOP_VINFO_LOOP (loop_vinfo);
2495 if (loop_vinfo && !step)
2497 if (dump_enabled_p ())
2498 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2499 "bad data-ref access in loop\n");
2500 return false;
2503 /* Allow loads with zero step in inner-loop vectorization. */
2504 if (loop_vinfo && integer_zerop (step))
2506 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2507 if (!nested_in_vect_loop_p (loop, stmt))
2508 return DR_IS_READ (dr);
2509 /* Allow references with zero step for outer loops marked
2510 with pragma omp simd only - it guarantees absence of
2511 loop-carried dependencies between inner loop iterations. */
2512 if (!loop->force_vectorize)
2514 if (dump_enabled_p ())
2515 dump_printf_loc (MSG_NOTE, vect_location,
2516 "zero step in inner loop of nest\n");
2517 return false;
2521 if (loop && nested_in_vect_loop_p (loop, stmt))
2523 /* Interleaved accesses are not yet supported within outer-loop
2524 vectorization for references in the inner-loop. */
2525 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2527 /* For the rest of the analysis we use the outer-loop step. */
2528 step = STMT_VINFO_DR_STEP (stmt_info);
2529 if (integer_zerop (step))
2531 if (dump_enabled_p ())
2532 dump_printf_loc (MSG_NOTE, vect_location,
2533 "zero step in outer loop.\n");
2534 return DR_IS_READ (dr);
2538 /* Consecutive? */
2539 if (TREE_CODE (step) == INTEGER_CST)
2541 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2542 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2543 || (dr_step < 0
2544 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2546 /* Mark that it is not interleaving. */
2547 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2548 return true;
2552 if (loop && nested_in_vect_loop_p (loop, stmt))
2554 if (dump_enabled_p ())
2555 dump_printf_loc (MSG_NOTE, vect_location,
2556 "grouped access in outer loop.\n");
2557 return false;
2561 /* Assume this is a DR handled by non-constant strided load case. */
2562 if (TREE_CODE (step) != INTEGER_CST)
2563 return (STMT_VINFO_STRIDED_P (stmt_info)
2564 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2565 || vect_analyze_group_access (dr)));
2567 /* Not consecutive access - check if it's a part of interleaving group. */
2568 return vect_analyze_group_access (dr);
2573 /* A helper function used in the comparator function to sort data
2574 references. T1 and T2 are two data references to be compared.
2575 The function returns -1, 0, or 1. */
2577 static int
2578 compare_tree (tree t1, tree t2)
2580 int i, cmp;
2581 enum tree_code code;
2582 char tclass;
2584 if (t1 == t2)
2585 return 0;
2586 if (t1 == NULL)
2587 return -1;
2588 if (t2 == NULL)
2589 return 1;
2591 STRIP_NOPS (t1);
2592 STRIP_NOPS (t2);
2594 if (TREE_CODE (t1) != TREE_CODE (t2))
2595 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2597 code = TREE_CODE (t1);
2598 switch (code)
2600 /* For const values, we can just use hash values for comparisons. */
2601 case INTEGER_CST:
2602 case REAL_CST:
2603 case FIXED_CST:
2604 case STRING_CST:
2605 case COMPLEX_CST:
2606 case VECTOR_CST:
2608 hashval_t h1 = iterative_hash_expr (t1, 0);
2609 hashval_t h2 = iterative_hash_expr (t2, 0);
2610 if (h1 != h2)
2611 return h1 < h2 ? -1 : 1;
2612 break;
2615 case SSA_NAME:
2616 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2617 if (cmp != 0)
2618 return cmp;
2620 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2621 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2622 break;
2624 default:
2625 tclass = TREE_CODE_CLASS (code);
2627 /* For var-decl, we could compare their UIDs. */
2628 if (tclass == tcc_declaration)
2630 if (DECL_UID (t1) != DECL_UID (t2))
2631 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2632 break;
2635 /* For expressions with operands, compare their operands recursively. */
2636 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2638 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2639 if (cmp != 0)
2640 return cmp;
2644 return 0;
2648 /* Compare two data-references DRA and DRB to group them into chunks
2649 suitable for grouping. */
2651 static int
2652 dr_group_sort_cmp (const void *dra_, const void *drb_)
2654 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2655 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2656 int cmp;
2658 /* Stabilize sort. */
2659 if (dra == drb)
2660 return 0;
2662 /* DRs in different loops never belong to the same group. */
2663 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2664 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2665 if (loopa != loopb)
2666 return loopa->num < loopb->num ? -1 : 1;
2668 /* Ordering of DRs according to base. */
2669 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2671 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2672 if (cmp != 0)
2673 return cmp;
2676 /* And according to DR_OFFSET. */
2677 if (!dr_equal_offsets_p (dra, drb))
2679 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2680 if (cmp != 0)
2681 return cmp;
2684 /* Put reads before writes. */
2685 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2686 return DR_IS_READ (dra) ? -1 : 1;
2688 /* Then sort after access size. */
2689 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2690 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2692 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2693 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2694 if (cmp != 0)
2695 return cmp;
2698 /* And after step. */
2699 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2701 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2702 if (cmp != 0)
2703 return cmp;
2706 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2707 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2708 if (cmp == 0)
2709 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2710 return cmp;
2713 /* Function vect_analyze_data_ref_accesses.
2715 Analyze the access pattern of all the data references in the loop.
2717 FORNOW: the only access pattern that is considered vectorizable is a
2718 simple step 1 (consecutive) access.
2720 FORNOW: handle only arrays and pointer accesses. */
2722 bool
2723 vect_analyze_data_ref_accesses (vec_info *vinfo)
2725 unsigned int i;
2726 vec<data_reference_p> datarefs = vinfo->datarefs;
2727 struct data_reference *dr;
2729 if (dump_enabled_p ())
2730 dump_printf_loc (MSG_NOTE, vect_location,
2731 "=== vect_analyze_data_ref_accesses ===\n");
2733 if (datarefs.is_empty ())
2734 return true;
2736 /* Sort the array of datarefs to make building the interleaving chains
2737 linear. Don't modify the original vector's order, it is needed for
2738 determining what dependencies are reversed. */
2739 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2740 datarefs_copy.qsort (dr_group_sort_cmp);
2742 /* Build the interleaving chains. */
2743 for (i = 0; i < datarefs_copy.length () - 1;)
2745 data_reference_p dra = datarefs_copy[i];
2746 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2747 stmt_vec_info lastinfo = NULL;
2748 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2750 ++i;
2751 continue;
2753 for (i = i + 1; i < datarefs_copy.length (); ++i)
2755 data_reference_p drb = datarefs_copy[i];
2756 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2757 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2758 break;
2760 /* ??? Imperfect sorting (non-compatible types, non-modulo
2761 accesses, same accesses) can lead to a group to be artificially
2762 split here as we don't just skip over those. If it really
2763 matters we can push those to a worklist and re-iterate
2764 over them. The we can just skip ahead to the next DR here. */
2766 /* DRs in a different loop should not be put into the same
2767 interleaving group. */
2768 if (gimple_bb (DR_STMT (dra))->loop_father
2769 != gimple_bb (DR_STMT (drb))->loop_father)
2770 break;
2772 /* Check that the data-refs have same first location (except init)
2773 and they are both either store or load (not load and store,
2774 not masked loads or stores). */
2775 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2776 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2777 DR_BASE_ADDRESS (drb), 0)
2778 || !dr_equal_offsets_p (dra, drb)
2779 || !gimple_assign_single_p (DR_STMT (dra))
2780 || !gimple_assign_single_p (DR_STMT (drb)))
2781 break;
2783 /* Check that the data-refs have the same constant size. */
2784 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2785 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2786 if (!tree_fits_uhwi_p (sza)
2787 || !tree_fits_uhwi_p (szb)
2788 || !tree_int_cst_equal (sza, szb))
2789 break;
2791 /* Check that the data-refs have the same step. */
2792 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2793 break;
2795 /* Do not place the same access in the interleaving chain twice. */
2796 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2797 break;
2799 /* Check the types are compatible.
2800 ??? We don't distinguish this during sorting. */
2801 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2802 TREE_TYPE (DR_REF (drb))))
2803 break;
2805 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2806 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2807 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2808 gcc_assert (init_a <= init_b);
2810 /* If init_b == init_a + the size of the type * k, we have an
2811 interleaving, and DRA is accessed before DRB. */
2812 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2813 if (type_size_a == 0
2814 || (init_b - init_a) % type_size_a != 0)
2815 break;
2817 /* If we have a store, the accesses are adjacent. This splits
2818 groups into chunks we support (we don't support vectorization
2819 of stores with gaps). */
2820 if (!DR_IS_READ (dra)
2821 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2822 (DR_INIT (datarefs_copy[i-1]))
2823 != type_size_a))
2824 break;
2826 /* If the step (if not zero or non-constant) is greater than the
2827 difference between data-refs' inits this splits groups into
2828 suitable sizes. */
2829 if (tree_fits_shwi_p (DR_STEP (dra)))
2831 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2832 if (step != 0 && step <= (init_b - init_a))
2833 break;
2836 if (dump_enabled_p ())
2838 dump_printf_loc (MSG_NOTE, vect_location,
2839 "Detected interleaving ");
2840 if (DR_IS_READ (dra))
2841 dump_printf (MSG_NOTE, "load ");
2842 else
2843 dump_printf (MSG_NOTE, "store ");
2844 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2845 dump_printf (MSG_NOTE, " and ");
2846 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2847 dump_printf (MSG_NOTE, "\n");
2850 /* Link the found element into the group list. */
2851 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2853 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2854 lastinfo = stmtinfo_a;
2856 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2857 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2858 lastinfo = stmtinfo_b;
2862 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2863 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2864 && !vect_analyze_data_ref_access (dr))
2866 if (dump_enabled_p ())
2867 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2868 "not vectorized: complicated access pattern.\n");
2870 if (is_a <bb_vec_info> (vinfo))
2872 /* Mark the statement as not vectorizable. */
2873 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2874 continue;
2876 else
2878 datarefs_copy.release ();
2879 return false;
2883 datarefs_copy.release ();
2884 return true;
2888 /* Operator == between two dr_with_seg_len objects.
2890 This equality operator is used to make sure two data refs
2891 are the same one so that we will consider to combine the
2892 aliasing checks of those two pairs of data dependent data
2893 refs. */
2895 static bool
2896 operator == (const dr_with_seg_len& d1,
2897 const dr_with_seg_len& d2)
2899 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2900 DR_BASE_ADDRESS (d2.dr), 0)
2901 && compare_tree (DR_OFFSET (d1.dr), DR_OFFSET (d2.dr)) == 0
2902 && compare_tree (DR_INIT (d1.dr), DR_INIT (d2.dr)) == 0
2903 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2906 /* Function comp_dr_with_seg_len_pair.
2908 Comparison function for sorting objects of dr_with_seg_len_pair_t
2909 so that we can combine aliasing checks in one scan. */
2911 static int
2912 comp_dr_with_seg_len_pair (const void *pa_, const void *pb_)
2914 const dr_with_seg_len_pair_t* pa = (const dr_with_seg_len_pair_t *) pa_;
2915 const dr_with_seg_len_pair_t* pb = (const dr_with_seg_len_pair_t *) pb_;
2916 const dr_with_seg_len &a1 = pa->first, &a2 = pa->second;
2917 const dr_with_seg_len &b1 = pb->first, &b2 = pb->second;
2919 /* For DR pairs (a, b) and (c, d), we only consider to merge the alias checks
2920 if a and c have the same basic address snd step, and b and d have the same
2921 address and step. Therefore, if any a&c or b&d don't have the same address
2922 and step, we don't care the order of those two pairs after sorting. */
2923 int comp_res;
2925 if ((comp_res = compare_tree (DR_BASE_ADDRESS (a1.dr),
2926 DR_BASE_ADDRESS (b1.dr))) != 0)
2927 return comp_res;
2928 if ((comp_res = compare_tree (DR_BASE_ADDRESS (a2.dr),
2929 DR_BASE_ADDRESS (b2.dr))) != 0)
2930 return comp_res;
2931 if ((comp_res = compare_tree (DR_STEP (a1.dr), DR_STEP (b1.dr))) != 0)
2932 return comp_res;
2933 if ((comp_res = compare_tree (DR_STEP (a2.dr), DR_STEP (b2.dr))) != 0)
2934 return comp_res;
2935 if ((comp_res = compare_tree (DR_OFFSET (a1.dr), DR_OFFSET (b1.dr))) != 0)
2936 return comp_res;
2937 if ((comp_res = compare_tree (DR_INIT (a1.dr), DR_INIT (b1.dr))) != 0)
2938 return comp_res;
2939 if ((comp_res = compare_tree (DR_OFFSET (a2.dr), DR_OFFSET (b2.dr))) != 0)
2940 return comp_res;
2941 if ((comp_res = compare_tree (DR_INIT (a2.dr), DR_INIT (b2.dr))) != 0)
2942 return comp_res;
2944 return 0;
2947 /* Function vect_vfa_segment_size.
2949 Create an expression that computes the size of segment
2950 that will be accessed for a data reference. The functions takes into
2951 account that realignment loads may access one more vector.
2953 Input:
2954 DR: The data reference.
2955 LENGTH_FACTOR: segment length to consider.
2957 Return an expression whose value is the size of segment which will be
2958 accessed by DR. */
2960 static tree
2961 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2963 tree segment_length;
2965 if (integer_zerop (DR_STEP (dr)))
2966 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2967 else
2968 segment_length = size_binop (MULT_EXPR,
2969 fold_convert (sizetype, DR_STEP (dr)),
2970 fold_convert (sizetype, length_factor));
2972 if (vect_supportable_dr_alignment (dr, false)
2973 == dr_explicit_realign_optimized)
2975 tree vector_size = TYPE_SIZE_UNIT
2976 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2978 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2980 return segment_length;
2983 /* Function vect_no_alias_p.
2985 Given data references A and B with equal base and offset, the alias
2986 relation can be decided at compilation time, return TRUE if they do
2987 not alias to each other; return FALSE otherwise. SEGMENT_LENGTH_A
2988 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
2989 respectively. */
2991 static bool
2992 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
2993 tree segment_length_a, tree segment_length_b)
2995 gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
2996 && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
2997 if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
2998 return false;
3000 tree seg_a_min = DR_INIT (a);
3001 tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
3002 seg_a_min, segment_length_a);
3003 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3004 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3005 [a, a+12) */
3006 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3008 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
3009 seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
3010 seg_a_max, unit_size);
3011 seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
3012 DR_INIT (a), unit_size);
3014 tree seg_b_min = DR_INIT (b);
3015 tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
3016 seg_b_min, segment_length_b);
3017 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3019 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
3020 seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
3021 seg_b_max, unit_size);
3022 seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
3023 DR_INIT (b), unit_size);
3026 if (tree_int_cst_le (seg_a_max, seg_b_min)
3027 || tree_int_cst_le (seg_b_max, seg_a_min))
3028 return true;
3030 return false;
3033 /* Function vect_prune_runtime_alias_test_list.
3035 Prune a list of ddrs to be tested at run-time by versioning for alias.
3036 Merge several alias checks into one if possible.
3037 Return FALSE if resulting list of ddrs is longer then allowed by
3038 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3040 bool
3041 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3043 vec<ddr_p> may_alias_ddrs =
3044 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3045 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
3046 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3047 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3048 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3050 ddr_p ddr;
3051 unsigned int i;
3052 tree length_factor;
3054 if (dump_enabled_p ())
3055 dump_printf_loc (MSG_NOTE, vect_location,
3056 "=== vect_prune_runtime_alias_test_list ===\n");
3058 if (may_alias_ddrs.is_empty ())
3059 return true;
3061 /* Basically, for each pair of dependent data refs store_ptr_0
3062 and load_ptr_0, we create an expression:
3064 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
3065 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
3067 for aliasing checks. However, in some cases we can decrease
3068 the number of checks by combining two checks into one. For
3069 example, suppose we have another pair of data refs store_ptr_0
3070 and load_ptr_1, and if the following condition is satisfied:
3072 load_ptr_0 < load_ptr_1 &&
3073 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
3075 (this condition means, in each iteration of vectorized loop,
3076 the accessed memory of store_ptr_0 cannot be between the memory
3077 of load_ptr_0 and load_ptr_1.)
3079 we then can use only the following expression to finish the
3080 alising checks between store_ptr_0 & load_ptr_0 and
3081 store_ptr_0 & load_ptr_1:
3083 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
3084 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
3086 Note that we only consider that load_ptr_0 and load_ptr_1 have the
3087 same basic address. */
3089 comp_alias_ddrs.create (may_alias_ddrs.length ());
3091 /* First, we collect all data ref pairs for aliasing checks. */
3092 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3094 int comp_res;
3095 struct data_reference *dr_a, *dr_b;
3096 gimple *dr_group_first_a, *dr_group_first_b;
3097 tree segment_length_a, segment_length_b;
3098 gimple *stmt_a, *stmt_b;
3100 dr_a = DDR_A (ddr);
3101 stmt_a = DR_STMT (DDR_A (ddr));
3102 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3103 if (dr_group_first_a)
3105 stmt_a = dr_group_first_a;
3106 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3109 dr_b = DDR_B (ddr);
3110 stmt_b = DR_STMT (DDR_B (ddr));
3111 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3112 if (dr_group_first_b)
3114 stmt_b = dr_group_first_b;
3115 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3118 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3119 length_factor = scalar_loop_iters;
3120 else
3121 length_factor = size_int (vect_factor);
3122 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3123 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3125 comp_res = compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b));
3126 if (comp_res == 0)
3127 comp_res = compare_tree (DR_OFFSET (dr_a), DR_OFFSET (dr_b));
3129 /* Alias is known at compilation time. */
3130 if (comp_res == 0
3131 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3132 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3133 && TREE_CODE (segment_length_a) == INTEGER_CST
3134 && TREE_CODE (segment_length_b) == INTEGER_CST)
3136 if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b))
3137 continue;
3139 if (dump_enabled_p ())
3140 dump_printf_loc (MSG_NOTE, vect_location,
3141 "not vectorized: compilation time alias.\n");
3143 return false;
3146 dr_with_seg_len_pair_t dr_with_seg_len_pair
3147 (dr_with_seg_len (dr_a, segment_length_a),
3148 dr_with_seg_len (dr_b, segment_length_b));
3150 /* Canonicalize pairs by sorting the two DR members. */
3151 if (comp_res > 0)
3152 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3154 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3157 /* Second, we sort the collected data ref pairs so that we can scan
3158 them once to combine all possible aliasing checks. */
3159 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
3161 /* Third, we scan the sorted dr pairs and check if we can combine
3162 alias checks of two neighboring dr pairs. */
3163 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
3165 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
3166 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
3167 *dr_b1 = &comp_alias_ddrs[i-1].second,
3168 *dr_a2 = &comp_alias_ddrs[i].first,
3169 *dr_b2 = &comp_alias_ddrs[i].second;
3171 /* Remove duplicate data ref pairs. */
3172 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
3174 if (dump_enabled_p ())
3176 dump_printf_loc (MSG_NOTE, vect_location,
3177 "found equal ranges ");
3178 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3179 DR_REF (dr_a1->dr));
3180 dump_printf (MSG_NOTE, ", ");
3181 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3182 DR_REF (dr_b1->dr));
3183 dump_printf (MSG_NOTE, " and ");
3184 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3185 DR_REF (dr_a2->dr));
3186 dump_printf (MSG_NOTE, ", ");
3187 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3188 DR_REF (dr_b2->dr));
3189 dump_printf (MSG_NOTE, "\n");
3192 comp_alias_ddrs.ordered_remove (i--);
3193 continue;
3196 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
3198 /* We consider the case that DR_B1 and DR_B2 are same memrefs,
3199 and DR_A1 and DR_A2 are two consecutive memrefs. */
3200 if (*dr_a1 == *dr_a2)
3202 std::swap (dr_a1, dr_b1);
3203 std::swap (dr_a2, dr_b2);
3206 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
3207 DR_BASE_ADDRESS (dr_a2->dr), 0)
3208 || !operand_equal_p (DR_OFFSET (dr_a1->dr),
3209 DR_OFFSET (dr_a2->dr), 0)
3210 || !tree_fits_shwi_p (DR_INIT (dr_a1->dr))
3211 || !tree_fits_shwi_p (DR_INIT (dr_a2->dr)))
3212 continue;
3214 /* Make sure dr_a1 starts left of dr_a2. */
3215 if (tree_int_cst_lt (DR_INIT (dr_a2->dr), DR_INIT (dr_a1->dr)))
3216 std::swap (*dr_a1, *dr_a2);
3218 bool do_remove = false;
3219 unsigned HOST_WIDE_INT diff
3220 = (tree_to_shwi (DR_INIT (dr_a2->dr))
3221 - tree_to_shwi (DR_INIT (dr_a1->dr)));
3223 /* If the left segment does not extend beyond the start of the
3224 right segment the new segment length is that of the right
3225 plus the segment distance. */
3226 if (tree_fits_uhwi_p (dr_a1->seg_len)
3227 && compare_tree_int (dr_a1->seg_len, diff) <= 0)
3229 dr_a1->seg_len = size_binop (PLUS_EXPR, dr_a2->seg_len,
3230 size_int (diff));
3231 do_remove = true;
3233 /* Generally the new segment length is the maximum of the
3234 left segment size and the right segment size plus the distance.
3235 ??? We can also build tree MAX_EXPR here but it's not clear this
3236 is profitable. */
3237 else if (tree_fits_uhwi_p (dr_a1->seg_len)
3238 && tree_fits_uhwi_p (dr_a2->seg_len))
3240 unsigned HOST_WIDE_INT seg_len_a1 = tree_to_uhwi (dr_a1->seg_len);
3241 unsigned HOST_WIDE_INT seg_len_a2 = tree_to_uhwi (dr_a2->seg_len);
3242 dr_a1->seg_len = size_int (MAX (seg_len_a1, diff + seg_len_a2));
3243 do_remove = true;
3245 /* Now we check if the following condition is satisfied:
3247 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
3249 where DIFF = DR_A2_INIT - DR_A1_INIT. However,
3250 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
3251 have to make a best estimation. We can get the minimum value
3252 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
3253 then either of the following two conditions can guarantee the
3254 one above:
3256 1: DIFF <= MIN_SEG_LEN_B
3257 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B */
3258 else
3260 unsigned HOST_WIDE_INT min_seg_len_b
3261 = (tree_fits_uhwi_p (dr_b1->seg_len)
3262 ? tree_to_uhwi (dr_b1->seg_len)
3263 : vect_factor);
3265 if (diff <= min_seg_len_b
3266 || (tree_fits_uhwi_p (dr_a1->seg_len)
3267 && diff - tree_to_uhwi (dr_a1->seg_len) < min_seg_len_b))
3269 dr_a1->seg_len = size_binop (PLUS_EXPR,
3270 dr_a2->seg_len, size_int (diff));
3271 do_remove = true;
3275 if (do_remove)
3277 if (dump_enabled_p ())
3279 dump_printf_loc (MSG_NOTE, vect_location,
3280 "merging ranges for ");
3281 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1->dr));
3282 dump_printf (MSG_NOTE, ", ");
3283 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1->dr));
3284 dump_printf (MSG_NOTE, " and ");
3285 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2->dr));
3286 dump_printf (MSG_NOTE, ", ");
3287 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2->dr));
3288 dump_printf (MSG_NOTE, "\n");
3290 comp_alias_ddrs.ordered_remove (i--);
3295 dump_printf_loc (MSG_NOTE, vect_location,
3296 "improved number of alias checks from %d to %d\n",
3297 may_alias_ddrs.length (), comp_alias_ddrs.length ());
3298 if ((int) comp_alias_ddrs.length () >
3299 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3301 if (dump_enabled_p ())
3302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3303 "number of versioning for alias "
3304 "run-time tests exceeds %d "
3305 "(--param vect-max-version-for-alias-checks)\n",
3306 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3307 return false;
3310 /* All alias checks have been resolved at compilation time. */
3311 if (!comp_alias_ddrs.length ())
3312 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0);
3314 return true;
3317 /* Return true if a non-affine read or write in STMT is suitable for a
3318 gather load or scatter store. Describe the operation in *INFO if so. */
3320 bool
3321 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3322 gather_scatter_info *info)
3324 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3325 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3326 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3327 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3328 tree offtype = NULL_TREE;
3329 tree decl, base, off;
3330 machine_mode pmode;
3331 int punsignedp, reversep, pvolatilep = 0;
3333 base = DR_REF (dr);
3334 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3335 see if we can use the def stmt of the address. */
3336 if (is_gimple_call (stmt)
3337 && gimple_call_internal_p (stmt)
3338 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3339 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3340 && TREE_CODE (base) == MEM_REF
3341 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3342 && integer_zerop (TREE_OPERAND (base, 1))
3343 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3345 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3346 if (is_gimple_assign (def_stmt)
3347 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3348 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3351 /* The gather and scatter builtins need address of the form
3352 loop_invariant + vector * {1, 2, 4, 8}
3354 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3355 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3356 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3357 multiplications and additions in it. To get a vector, we need
3358 a single SSA_NAME that will be defined in the loop and will
3359 contain everything that is not loop invariant and that can be
3360 vectorized. The following code attempts to find such a preexistng
3361 SSA_NAME OFF and put the loop invariants into a tree BASE
3362 that can be gimplified before the loop. */
3363 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3364 &punsignedp, &reversep, &pvolatilep);
3365 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3367 if (TREE_CODE (base) == MEM_REF)
3369 if (!integer_zerop (TREE_OPERAND (base, 1)))
3371 if (off == NULL_TREE)
3373 offset_int moff = mem_ref_offset (base);
3374 off = wide_int_to_tree (sizetype, moff);
3376 else
3377 off = size_binop (PLUS_EXPR, off,
3378 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3380 base = TREE_OPERAND (base, 0);
3382 else
3383 base = build_fold_addr_expr (base);
3385 if (off == NULL_TREE)
3386 off = size_zero_node;
3388 /* If base is not loop invariant, either off is 0, then we start with just
3389 the constant offset in the loop invariant BASE and continue with base
3390 as OFF, otherwise give up.
3391 We could handle that case by gimplifying the addition of base + off
3392 into some SSA_NAME and use that as off, but for now punt. */
3393 if (!expr_invariant_in_loop_p (loop, base))
3395 if (!integer_zerop (off))
3396 return false;
3397 off = base;
3398 base = size_int (pbitpos / BITS_PER_UNIT);
3400 /* Otherwise put base + constant offset into the loop invariant BASE
3401 and continue with OFF. */
3402 else
3404 base = fold_convert (sizetype, base);
3405 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3408 /* OFF at this point may be either a SSA_NAME or some tree expression
3409 from get_inner_reference. Try to peel off loop invariants from it
3410 into BASE as long as possible. */
3411 STRIP_NOPS (off);
3412 while (offtype == NULL_TREE)
3414 enum tree_code code;
3415 tree op0, op1, add = NULL_TREE;
3417 if (TREE_CODE (off) == SSA_NAME)
3419 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3421 if (expr_invariant_in_loop_p (loop, off))
3422 return false;
3424 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3425 break;
3427 op0 = gimple_assign_rhs1 (def_stmt);
3428 code = gimple_assign_rhs_code (def_stmt);
3429 op1 = gimple_assign_rhs2 (def_stmt);
3431 else
3433 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3434 return false;
3435 code = TREE_CODE (off);
3436 extract_ops_from_tree (off, &code, &op0, &op1);
3438 switch (code)
3440 case POINTER_PLUS_EXPR:
3441 case PLUS_EXPR:
3442 if (expr_invariant_in_loop_p (loop, op0))
3444 add = op0;
3445 off = op1;
3446 do_add:
3447 add = fold_convert (sizetype, add);
3448 if (scale != 1)
3449 add = size_binop (MULT_EXPR, add, size_int (scale));
3450 base = size_binop (PLUS_EXPR, base, add);
3451 continue;
3453 if (expr_invariant_in_loop_p (loop, op1))
3455 add = op1;
3456 off = op0;
3457 goto do_add;
3459 break;
3460 case MINUS_EXPR:
3461 if (expr_invariant_in_loop_p (loop, op1))
3463 add = fold_convert (sizetype, op1);
3464 add = size_binop (MINUS_EXPR, size_zero_node, add);
3465 off = op0;
3466 goto do_add;
3468 break;
3469 case MULT_EXPR:
3470 if (scale == 1 && tree_fits_shwi_p (op1))
3472 scale = tree_to_shwi (op1);
3473 off = op0;
3474 continue;
3476 break;
3477 case SSA_NAME:
3478 off = op0;
3479 continue;
3480 CASE_CONVERT:
3481 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3482 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3483 break;
3484 if (TYPE_PRECISION (TREE_TYPE (op0))
3485 == TYPE_PRECISION (TREE_TYPE (off)))
3487 off = op0;
3488 continue;
3490 if (TYPE_PRECISION (TREE_TYPE (op0))
3491 < TYPE_PRECISION (TREE_TYPE (off)))
3493 off = op0;
3494 offtype = TREE_TYPE (off);
3495 STRIP_NOPS (off);
3496 continue;
3498 break;
3499 default:
3500 break;
3502 break;
3505 /* If at the end OFF still isn't a SSA_NAME or isn't
3506 defined in the loop, punt. */
3507 if (TREE_CODE (off) != SSA_NAME
3508 || expr_invariant_in_loop_p (loop, off))
3509 return false;
3511 if (offtype == NULL_TREE)
3512 offtype = TREE_TYPE (off);
3514 if (DR_IS_READ (dr))
3515 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3516 offtype, scale);
3517 else
3518 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3519 offtype, scale);
3521 if (decl == NULL_TREE)
3522 return false;
3524 info->decl = decl;
3525 info->base = base;
3526 info->offset = off;
3527 info->offset_dt = vect_unknown_def_type;
3528 info->offset_vectype = NULL_TREE;
3529 info->scale = scale;
3530 return true;
3533 /* Function vect_analyze_data_refs.
3535 Find all the data references in the loop or basic block.
3537 The general structure of the analysis of data refs in the vectorizer is as
3538 follows:
3539 1- vect_analyze_data_refs(loop/bb): call
3540 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3541 in the loop/bb and their dependences.
3542 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3543 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3544 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3548 bool
3549 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3551 struct loop *loop = NULL;
3552 unsigned int i;
3553 struct data_reference *dr;
3554 tree scalar_type;
3556 if (dump_enabled_p ())
3557 dump_printf_loc (MSG_NOTE, vect_location,
3558 "=== vect_analyze_data_refs ===\n");
3560 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3561 loop = LOOP_VINFO_LOOP (loop_vinfo);
3563 /* Go through the data-refs, check that the analysis succeeded. Update
3564 pointer from stmt_vec_info struct to DR and vectype. */
3566 vec<data_reference_p> datarefs = vinfo->datarefs;
3567 FOR_EACH_VEC_ELT (datarefs, i, dr)
3569 gimple *stmt;
3570 stmt_vec_info stmt_info;
3571 tree base, offset, init;
3572 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3573 bool simd_lane_access = false;
3574 int vf;
3576 again:
3577 if (!dr || !DR_REF (dr))
3579 if (dump_enabled_p ())
3580 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3581 "not vectorized: unhandled data-ref\n");
3582 return false;
3585 stmt = DR_STMT (dr);
3586 stmt_info = vinfo_for_stmt (stmt);
3588 /* Discard clobbers from the dataref vector. We will remove
3589 clobber stmts during vectorization. */
3590 if (gimple_clobber_p (stmt))
3592 free_data_ref (dr);
3593 if (i == datarefs.length () - 1)
3595 datarefs.pop ();
3596 break;
3598 datarefs.ordered_remove (i);
3599 dr = datarefs[i];
3600 goto again;
3603 /* Check that analysis of the data-ref succeeded. */
3604 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3605 || !DR_STEP (dr))
3607 bool maybe_gather
3608 = DR_IS_READ (dr)
3609 && !TREE_THIS_VOLATILE (DR_REF (dr))
3610 && targetm.vectorize.builtin_gather != NULL;
3611 bool maybe_scatter
3612 = DR_IS_WRITE (dr)
3613 && !TREE_THIS_VOLATILE (DR_REF (dr))
3614 && targetm.vectorize.builtin_scatter != NULL;
3615 bool maybe_simd_lane_access
3616 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3618 /* If target supports vector gather loads or scatter stores, or if
3619 this might be a SIMD lane access, see if they can't be used. */
3620 if (is_a <loop_vec_info> (vinfo)
3621 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3622 && !nested_in_vect_loop_p (loop, stmt))
3624 struct data_reference *newdr
3625 = create_data_ref (NULL, loop_containing_stmt (stmt),
3626 DR_REF (dr), stmt, maybe_scatter ? false : true);
3627 gcc_assert (newdr != NULL && DR_REF (newdr));
3628 if (DR_BASE_ADDRESS (newdr)
3629 && DR_OFFSET (newdr)
3630 && DR_INIT (newdr)
3631 && DR_STEP (newdr)
3632 && integer_zerop (DR_STEP (newdr)))
3634 if (maybe_simd_lane_access)
3636 tree off = DR_OFFSET (newdr);
3637 STRIP_NOPS (off);
3638 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3639 && TREE_CODE (off) == MULT_EXPR
3640 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3642 tree step = TREE_OPERAND (off, 1);
3643 off = TREE_OPERAND (off, 0);
3644 STRIP_NOPS (off);
3645 if (CONVERT_EXPR_P (off)
3646 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3647 0)))
3648 < TYPE_PRECISION (TREE_TYPE (off)))
3649 off = TREE_OPERAND (off, 0);
3650 if (TREE_CODE (off) == SSA_NAME)
3652 gimple *def = SSA_NAME_DEF_STMT (off);
3653 tree reft = TREE_TYPE (DR_REF (newdr));
3654 if (is_gimple_call (def)
3655 && gimple_call_internal_p (def)
3656 && (gimple_call_internal_fn (def)
3657 == IFN_GOMP_SIMD_LANE))
3659 tree arg = gimple_call_arg (def, 0);
3660 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3661 arg = SSA_NAME_VAR (arg);
3662 if (arg == loop->simduid
3663 /* For now. */
3664 && tree_int_cst_equal
3665 (TYPE_SIZE_UNIT (reft),
3666 step))
3668 DR_OFFSET (newdr) = ssize_int (0);
3669 DR_STEP (newdr) = step;
3670 DR_ALIGNED_TO (newdr)
3671 = size_int (BIGGEST_ALIGNMENT);
3672 dr = newdr;
3673 simd_lane_access = true;
3679 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3681 dr = newdr;
3682 if (maybe_gather)
3683 gatherscatter = GATHER;
3684 else
3685 gatherscatter = SCATTER;
3688 if (gatherscatter == SG_NONE && !simd_lane_access)
3689 free_data_ref (newdr);
3692 if (gatherscatter == SG_NONE && !simd_lane_access)
3694 if (dump_enabled_p ())
3696 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3697 "not vectorized: data ref analysis "
3698 "failed ");
3699 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3702 if (is_a <bb_vec_info> (vinfo))
3703 break;
3705 return false;
3709 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3711 if (dump_enabled_p ())
3712 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3713 "not vectorized: base addr of dr is a "
3714 "constant\n");
3716 if (is_a <bb_vec_info> (vinfo))
3717 break;
3719 if (gatherscatter != SG_NONE || simd_lane_access)
3720 free_data_ref (dr);
3721 return false;
3724 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3726 if (dump_enabled_p ())
3728 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3729 "not vectorized: volatile type ");
3730 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3733 if (is_a <bb_vec_info> (vinfo))
3734 break;
3736 return false;
3739 if (stmt_can_throw_internal (stmt))
3741 if (dump_enabled_p ())
3743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3744 "not vectorized: statement can throw an "
3745 "exception ");
3746 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3749 if (is_a <bb_vec_info> (vinfo))
3750 break;
3752 if (gatherscatter != SG_NONE || simd_lane_access)
3753 free_data_ref (dr);
3754 return false;
3757 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3758 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3760 if (dump_enabled_p ())
3762 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3763 "not vectorized: statement is bitfield "
3764 "access ");
3765 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3768 if (is_a <bb_vec_info> (vinfo))
3769 break;
3771 if (gatherscatter != SG_NONE || simd_lane_access)
3772 free_data_ref (dr);
3773 return false;
3776 base = unshare_expr (DR_BASE_ADDRESS (dr));
3777 offset = unshare_expr (DR_OFFSET (dr));
3778 init = unshare_expr (DR_INIT (dr));
3780 if (is_gimple_call (stmt)
3781 && (!gimple_call_internal_p (stmt)
3782 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3783 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3785 if (dump_enabled_p ())
3787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3788 "not vectorized: dr in a call ");
3789 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3792 if (is_a <bb_vec_info> (vinfo))
3793 break;
3795 if (gatherscatter != SG_NONE || simd_lane_access)
3796 free_data_ref (dr);
3797 return false;
3800 /* Update DR field in stmt_vec_info struct. */
3802 /* If the dataref is in an inner-loop of the loop that is considered for
3803 for vectorization, we also want to analyze the access relative to
3804 the outer-loop (DR contains information only relative to the
3805 inner-most enclosing loop). We do that by building a reference to the
3806 first location accessed by the inner-loop, and analyze it relative to
3807 the outer-loop. */
3808 if (loop && nested_in_vect_loop_p (loop, stmt))
3810 tree outer_step, outer_base, outer_init;
3811 HOST_WIDE_INT pbitsize, pbitpos;
3812 tree poffset;
3813 machine_mode pmode;
3814 int punsignedp, preversep, pvolatilep;
3815 affine_iv base_iv, offset_iv;
3816 tree dinit;
3818 /* Build a reference to the first location accessed by the
3819 inner-loop: *(BASE+INIT). (The first location is actually
3820 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3821 tree inner_base = build_fold_indirect_ref
3822 (fold_build_pointer_plus (base, init));
3824 if (dump_enabled_p ())
3826 dump_printf_loc (MSG_NOTE, vect_location,
3827 "analyze in outer-loop: ");
3828 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3829 dump_printf (MSG_NOTE, "\n");
3832 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3833 &poffset, &pmode, &punsignedp,
3834 &preversep, &pvolatilep);
3835 gcc_assert (outer_base != NULL_TREE);
3837 if (pbitpos % BITS_PER_UNIT != 0)
3839 if (dump_enabled_p ())
3840 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3841 "failed: bit offset alignment.\n");
3842 return false;
3845 if (preversep)
3847 if (dump_enabled_p ())
3848 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3849 "failed: reverse storage order.\n");
3850 return false;
3853 outer_base = build_fold_addr_expr (outer_base);
3854 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3855 &base_iv, false))
3857 if (dump_enabled_p ())
3858 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3859 "failed: evolution of base is not affine.\n");
3860 return false;
3863 if (offset)
3865 if (poffset)
3866 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3867 poffset);
3868 else
3869 poffset = offset;
3872 if (!poffset)
3874 offset_iv.base = ssize_int (0);
3875 offset_iv.step = ssize_int (0);
3877 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3878 &offset_iv, false))
3880 if (dump_enabled_p ())
3881 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3882 "evolution of offset is not affine.\n");
3883 return false;
3886 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3887 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3888 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3889 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3890 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3892 outer_step = size_binop (PLUS_EXPR,
3893 fold_convert (ssizetype, base_iv.step),
3894 fold_convert (ssizetype, offset_iv.step));
3896 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3897 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3898 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3899 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3900 STMT_VINFO_DR_OFFSET (stmt_info) =
3901 fold_convert (ssizetype, offset_iv.base);
3902 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3903 size_int (highest_pow2_factor (offset_iv.base));
3905 if (dump_enabled_p ())
3907 dump_printf_loc (MSG_NOTE, vect_location,
3908 "\touter base_address: ");
3909 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3910 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3911 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3912 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3913 STMT_VINFO_DR_OFFSET (stmt_info));
3914 dump_printf (MSG_NOTE,
3915 "\n\touter constant offset from base address: ");
3916 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3917 STMT_VINFO_DR_INIT (stmt_info));
3918 dump_printf (MSG_NOTE, "\n\touter step: ");
3919 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3920 STMT_VINFO_DR_STEP (stmt_info));
3921 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3922 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3923 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3924 dump_printf (MSG_NOTE, "\n");
3928 if (STMT_VINFO_DATA_REF (stmt_info))
3930 if (dump_enabled_p ())
3932 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3933 "not vectorized: more than one data ref "
3934 "in stmt: ");
3935 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3938 if (is_a <bb_vec_info> (vinfo))
3939 break;
3941 if (gatherscatter != SG_NONE || simd_lane_access)
3942 free_data_ref (dr);
3943 return false;
3946 STMT_VINFO_DATA_REF (stmt_info) = dr;
3947 if (simd_lane_access)
3949 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3950 free_data_ref (datarefs[i]);
3951 datarefs[i] = dr;
3954 /* Set vectype for STMT. */
3955 scalar_type = TREE_TYPE (DR_REF (dr));
3956 STMT_VINFO_VECTYPE (stmt_info)
3957 = get_vectype_for_scalar_type (scalar_type);
3958 if (!STMT_VINFO_VECTYPE (stmt_info))
3960 if (dump_enabled_p ())
3962 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3963 "not vectorized: no vectype for stmt: ");
3964 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3965 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3966 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3967 scalar_type);
3968 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3971 if (is_a <bb_vec_info> (vinfo))
3973 /* No vector type is fine, the ref can still participate
3974 in dependence analysis, we just can't vectorize it. */
3975 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3976 continue;
3979 if (gatherscatter != SG_NONE || simd_lane_access)
3981 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3982 if (gatherscatter != SG_NONE)
3983 free_data_ref (dr);
3985 return false;
3987 else
3989 if (dump_enabled_p ())
3991 dump_printf_loc (MSG_NOTE, vect_location,
3992 "got vectype for stmt: ");
3993 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3994 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3995 STMT_VINFO_VECTYPE (stmt_info));
3996 dump_printf (MSG_NOTE, "\n");
4000 /* Adjust the minimal vectorization factor according to the
4001 vector type. */
4002 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4003 if (vf > *min_vf)
4004 *min_vf = vf;
4006 if (gatherscatter != SG_NONE)
4008 gather_scatter_info gs_info;
4009 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4010 &gs_info)
4011 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4013 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4014 free_data_ref (dr);
4015 if (dump_enabled_p ())
4017 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4018 (gatherscatter == GATHER) ?
4019 "not vectorized: not suitable for gather "
4020 "load " :
4021 "not vectorized: not suitable for scatter "
4022 "store ");
4023 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4025 return false;
4028 free_data_ref (datarefs[i]);
4029 datarefs[i] = dr;
4030 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4033 else if (is_a <loop_vec_info> (vinfo)
4034 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4036 if (nested_in_vect_loop_p (loop, stmt))
4038 if (dump_enabled_p ())
4040 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4041 "not vectorized: not suitable for strided "
4042 "load ");
4043 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4045 return false;
4047 STMT_VINFO_STRIDED_P (stmt_info) = true;
4051 /* If we stopped analysis at the first dataref we could not analyze
4052 when trying to vectorize a basic-block mark the rest of the datarefs
4053 as not vectorizable and truncate the vector of datarefs. That
4054 avoids spending useless time in analyzing their dependence. */
4055 if (i != datarefs.length ())
4057 gcc_assert (is_a <bb_vec_info> (vinfo));
4058 for (unsigned j = i; j < datarefs.length (); ++j)
4060 data_reference_p dr = datarefs[j];
4061 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
4062 free_data_ref (dr);
4064 datarefs.truncate (i);
4067 return true;
4071 /* Function vect_get_new_vect_var.
4073 Returns a name for a new variable. The current naming scheme appends the
4074 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4075 the name of vectorizer generated variables, and appends that to NAME if
4076 provided. */
4078 tree
4079 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4081 const char *prefix;
4082 tree new_vect_var;
4084 switch (var_kind)
4086 case vect_simple_var:
4087 prefix = "vect";
4088 break;
4089 case vect_scalar_var:
4090 prefix = "stmp";
4091 break;
4092 case vect_mask_var:
4093 prefix = "mask";
4094 break;
4095 case vect_pointer_var:
4096 prefix = "vectp";
4097 break;
4098 default:
4099 gcc_unreachable ();
4102 if (name)
4104 char* tmp = concat (prefix, "_", name, NULL);
4105 new_vect_var = create_tmp_reg (type, tmp);
4106 free (tmp);
4108 else
4109 new_vect_var = create_tmp_reg (type, prefix);
4111 return new_vect_var;
4114 /* Like vect_get_new_vect_var but return an SSA name. */
4116 tree
4117 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4119 const char *prefix;
4120 tree new_vect_var;
4122 switch (var_kind)
4124 case vect_simple_var:
4125 prefix = "vect";
4126 break;
4127 case vect_scalar_var:
4128 prefix = "stmp";
4129 break;
4130 case vect_pointer_var:
4131 prefix = "vectp";
4132 break;
4133 default:
4134 gcc_unreachable ();
4137 if (name)
4139 char* tmp = concat (prefix, "_", name, NULL);
4140 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4141 free (tmp);
4143 else
4144 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4146 return new_vect_var;
4149 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. */
4151 static void
4152 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
4153 stmt_vec_info stmt_info)
4155 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4156 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
4157 int misalign = DR_MISALIGNMENT (dr);
4158 if (misalign == -1)
4159 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4160 else
4161 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
4164 /* Function vect_create_addr_base_for_vector_ref.
4166 Create an expression that computes the address of the first memory location
4167 that will be accessed for a data reference.
4169 Input:
4170 STMT: The statement containing the data reference.
4171 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4172 OFFSET: Optional. If supplied, it is be added to the initial address.
4173 LOOP: Specify relative to which loop-nest should the address be computed.
4174 For example, when the dataref is in an inner-loop nested in an
4175 outer-loop that is now being vectorized, LOOP can be either the
4176 outer-loop, or the inner-loop. The first memory location accessed
4177 by the following dataref ('in' points to short):
4179 for (i=0; i<N; i++)
4180 for (j=0; j<M; j++)
4181 s += in[i+j]
4183 is as follows:
4184 if LOOP=i_loop: &in (relative to i_loop)
4185 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4186 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4187 initial address. Unlike OFFSET, which is number of elements to
4188 be added, BYTE_OFFSET is measured in bytes.
4190 Output:
4191 1. Return an SSA_NAME whose value is the address of the memory location of
4192 the first vector of the data reference.
4193 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4194 these statement(s) which define the returned SSA_NAME.
4196 FORNOW: We are only handling array accesses with step 1. */
4198 tree
4199 vect_create_addr_base_for_vector_ref (gimple *stmt,
4200 gimple_seq *new_stmt_list,
4201 tree offset,
4202 struct loop *loop,
4203 tree byte_offset)
4205 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4206 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4207 tree data_ref_base;
4208 const char *base_name;
4209 tree addr_base;
4210 tree dest;
4211 gimple_seq seq = NULL;
4212 tree base_offset;
4213 tree init;
4214 tree vect_ptr_type;
4215 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4216 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4218 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
4220 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
4222 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
4224 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4225 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
4226 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
4228 else
4230 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
4231 base_offset = unshare_expr (DR_OFFSET (dr));
4232 init = unshare_expr (DR_INIT (dr));
4235 if (loop_vinfo)
4236 base_name = get_name (data_ref_base);
4237 else
4239 base_offset = ssize_int (0);
4240 init = ssize_int (0);
4241 base_name = get_name (DR_REF (dr));
4244 /* Create base_offset */
4245 base_offset = size_binop (PLUS_EXPR,
4246 fold_convert (sizetype, base_offset),
4247 fold_convert (sizetype, init));
4249 if (offset)
4251 offset = fold_build2 (MULT_EXPR, sizetype,
4252 fold_convert (sizetype, offset), step);
4253 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4254 base_offset, offset);
4256 if (byte_offset)
4258 byte_offset = fold_convert (sizetype, byte_offset);
4259 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4260 base_offset, byte_offset);
4263 /* base + base_offset */
4264 if (loop_vinfo)
4265 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4266 else
4268 addr_base = build1 (ADDR_EXPR,
4269 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4270 unshare_expr (DR_REF (dr)));
4273 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4274 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4275 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4276 gimple_seq_add_seq (new_stmt_list, seq);
4278 if (DR_PTR_INFO (dr)
4279 && TREE_CODE (addr_base) == SSA_NAME
4280 && !SSA_NAME_PTR_INFO (addr_base))
4282 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4283 if (offset || byte_offset)
4284 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4287 if (dump_enabled_p ())
4289 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4290 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4291 dump_printf (MSG_NOTE, "\n");
4294 return addr_base;
4298 /* Function vect_create_data_ref_ptr.
4300 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4301 location accessed in the loop by STMT, along with the def-use update
4302 chain to appropriately advance the pointer through the loop iterations.
4303 Also set aliasing information for the pointer. This pointer is used by
4304 the callers to this function to create a memory reference expression for
4305 vector load/store access.
4307 Input:
4308 1. STMT: a stmt that references memory. Expected to be of the form
4309 GIMPLE_ASSIGN <name, data-ref> or
4310 GIMPLE_ASSIGN <data-ref, name>.
4311 2. AGGR_TYPE: the type of the reference, which should be either a vector
4312 or an array.
4313 3. AT_LOOP: the loop where the vector memref is to be created.
4314 4. OFFSET (optional): an offset to be added to the initial address accessed
4315 by the data-ref in STMT.
4316 5. BSI: location where the new stmts are to be placed if there is no loop
4317 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4318 pointing to the initial address.
4319 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4320 to the initial address accessed by the data-ref in STMT. This is
4321 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4322 in bytes.
4324 Output:
4325 1. Declare a new ptr to vector_type, and have it point to the base of the
4326 data reference (initial addressed accessed by the data reference).
4327 For example, for vector of type V8HI, the following code is generated:
4329 v8hi *ap;
4330 ap = (v8hi *)initial_address;
4332 if OFFSET is not supplied:
4333 initial_address = &a[init];
4334 if OFFSET is supplied:
4335 initial_address = &a[init + OFFSET];
4336 if BYTE_OFFSET is supplied:
4337 initial_address = &a[init] + BYTE_OFFSET;
4339 Return the initial_address in INITIAL_ADDRESS.
4341 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4342 update the pointer in each iteration of the loop.
4344 Return the increment stmt that updates the pointer in PTR_INCR.
4346 3. Set INV_P to true if the access pattern of the data reference in the
4347 vectorized loop is invariant. Set it to false otherwise.
4349 4. Return the pointer. */
4351 tree
4352 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4353 tree offset, tree *initial_address,
4354 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4355 bool only_init, bool *inv_p, tree byte_offset)
4357 const char *base_name;
4358 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4359 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4360 struct loop *loop = NULL;
4361 bool nested_in_vect_loop = false;
4362 struct loop *containing_loop = NULL;
4363 tree aggr_ptr_type;
4364 tree aggr_ptr;
4365 tree new_temp;
4366 gimple_seq new_stmt_list = NULL;
4367 edge pe = NULL;
4368 basic_block new_bb;
4369 tree aggr_ptr_init;
4370 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4371 tree aptr;
4372 gimple_stmt_iterator incr_gsi;
4373 bool insert_after;
4374 tree indx_before_incr, indx_after_incr;
4375 gimple *incr;
4376 tree step;
4377 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4379 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4380 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4382 if (loop_vinfo)
4384 loop = LOOP_VINFO_LOOP (loop_vinfo);
4385 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4386 containing_loop = (gimple_bb (stmt))->loop_father;
4387 pe = loop_preheader_edge (loop);
4389 else
4391 gcc_assert (bb_vinfo);
4392 only_init = true;
4393 *ptr_incr = NULL;
4396 /* Check the step (evolution) of the load in LOOP, and record
4397 whether it's invariant. */
4398 if (nested_in_vect_loop)
4399 step = STMT_VINFO_DR_STEP (stmt_info);
4400 else
4401 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4403 if (integer_zerop (step))
4404 *inv_p = true;
4405 else
4406 *inv_p = false;
4408 /* Create an expression for the first address accessed by this load
4409 in LOOP. */
4410 base_name = get_name (DR_BASE_ADDRESS (dr));
4412 if (dump_enabled_p ())
4414 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4415 dump_printf_loc (MSG_NOTE, vect_location,
4416 "create %s-pointer variable to type: ",
4417 get_tree_code_name (TREE_CODE (aggr_type)));
4418 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4419 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4420 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4421 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4422 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4423 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4424 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4425 else
4426 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4427 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4428 dump_printf (MSG_NOTE, "\n");
4431 /* (1) Create the new aggregate-pointer variable.
4432 Vector and array types inherit the alias set of their component
4433 type by default so we need to use a ref-all pointer if the data
4434 reference does not conflict with the created aggregated data
4435 reference because it is not addressable. */
4436 bool need_ref_all = false;
4437 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4438 get_alias_set (DR_REF (dr))))
4439 need_ref_all = true;
4440 /* Likewise for any of the data references in the stmt group. */
4441 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4443 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4446 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4447 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4448 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4449 get_alias_set (DR_REF (sdr))))
4451 need_ref_all = true;
4452 break;
4454 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4456 while (orig_stmt);
4458 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4459 need_ref_all);
4460 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4463 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4464 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4465 def-use update cycles for the pointer: one relative to the outer-loop
4466 (LOOP), which is what steps (3) and (4) below do. The other is relative
4467 to the inner-loop (which is the inner-most loop containing the dataref),
4468 and this is done be step (5) below.
4470 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4471 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4472 redundant. Steps (3),(4) create the following:
4474 vp0 = &base_addr;
4475 LOOP: vp1 = phi(vp0,vp2)
4478 vp2 = vp1 + step
4479 goto LOOP
4481 If there is an inner-loop nested in loop, then step (5) will also be
4482 applied, and an additional update in the inner-loop will be created:
4484 vp0 = &base_addr;
4485 LOOP: vp1 = phi(vp0,vp2)
4487 inner: vp3 = phi(vp1,vp4)
4488 vp4 = vp3 + inner_step
4489 if () goto inner
4491 vp2 = vp1 + step
4492 if () goto LOOP */
4494 /* (2) Calculate the initial address of the aggregate-pointer, and set
4495 the aggregate-pointer to point to it before the loop. */
4497 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4499 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4500 offset, loop, byte_offset);
4501 if (new_stmt_list)
4503 if (pe)
4505 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4506 gcc_assert (!new_bb);
4508 else
4509 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4512 *initial_address = new_temp;
4513 aggr_ptr_init = new_temp;
4515 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4516 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4517 inner-loop nested in LOOP (during outer-loop vectorization). */
4519 /* No update in loop is required. */
4520 if (only_init && (!loop_vinfo || at_loop == loop))
4521 aptr = aggr_ptr_init;
4522 else
4524 /* The step of the aggregate pointer is the type size. */
4525 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4526 /* One exception to the above is when the scalar step of the load in
4527 LOOP is zero. In this case the step here is also zero. */
4528 if (*inv_p)
4529 iv_step = size_zero_node;
4530 else if (tree_int_cst_sgn (step) == -1)
4531 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4533 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4535 create_iv (aggr_ptr_init,
4536 fold_convert (aggr_ptr_type, iv_step),
4537 aggr_ptr, loop, &incr_gsi, insert_after,
4538 &indx_before_incr, &indx_after_incr);
4539 incr = gsi_stmt (incr_gsi);
4540 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4542 /* Copy the points-to information if it exists. */
4543 if (DR_PTR_INFO (dr))
4545 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4546 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4548 if (ptr_incr)
4549 *ptr_incr = incr;
4551 aptr = indx_before_incr;
4554 if (!nested_in_vect_loop || only_init)
4555 return aptr;
4558 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4559 nested in LOOP, if exists. */
4561 gcc_assert (nested_in_vect_loop);
4562 if (!only_init)
4564 standard_iv_increment_position (containing_loop, &incr_gsi,
4565 &insert_after);
4566 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4567 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4568 &indx_after_incr);
4569 incr = gsi_stmt (incr_gsi);
4570 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4572 /* Copy the points-to information if it exists. */
4573 if (DR_PTR_INFO (dr))
4575 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4576 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4578 if (ptr_incr)
4579 *ptr_incr = incr;
4581 return indx_before_incr;
4583 else
4584 gcc_unreachable ();
4588 /* Function bump_vector_ptr
4590 Increment a pointer (to a vector type) by vector-size. If requested,
4591 i.e. if PTR-INCR is given, then also connect the new increment stmt
4592 to the existing def-use update-chain of the pointer, by modifying
4593 the PTR_INCR as illustrated below:
4595 The pointer def-use update-chain before this function:
4596 DATAREF_PTR = phi (p_0, p_2)
4597 ....
4598 PTR_INCR: p_2 = DATAREF_PTR + step
4600 The pointer def-use update-chain after this function:
4601 DATAREF_PTR = phi (p_0, p_2)
4602 ....
4603 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4604 ....
4605 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4607 Input:
4608 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4609 in the loop.
4610 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4611 the loop. The increment amount across iterations is expected
4612 to be vector_size.
4613 BSI - location where the new update stmt is to be placed.
4614 STMT - the original scalar memory-access stmt that is being vectorized.
4615 BUMP - optional. The offset by which to bump the pointer. If not given,
4616 the offset is assumed to be vector_size.
4618 Output: Return NEW_DATAREF_PTR as illustrated above.
4622 tree
4623 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4624 gimple *stmt, tree bump)
4626 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4627 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4628 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4629 tree update = TYPE_SIZE_UNIT (vectype);
4630 gassign *incr_stmt;
4631 ssa_op_iter iter;
4632 use_operand_p use_p;
4633 tree new_dataref_ptr;
4635 if (bump)
4636 update = bump;
4638 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4639 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4640 else
4641 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4642 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4643 dataref_ptr, update);
4644 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4646 /* Copy the points-to information if it exists. */
4647 if (DR_PTR_INFO (dr))
4649 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4650 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4653 if (!ptr_incr)
4654 return new_dataref_ptr;
4656 /* Update the vector-pointer's cross-iteration increment. */
4657 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4659 tree use = USE_FROM_PTR (use_p);
4661 if (use == dataref_ptr)
4662 SET_USE (use_p, new_dataref_ptr);
4663 else
4664 gcc_assert (tree_int_cst_compare (use, update) == 0);
4667 return new_dataref_ptr;
4671 /* Function vect_create_destination_var.
4673 Create a new temporary of type VECTYPE. */
4675 tree
4676 vect_create_destination_var (tree scalar_dest, tree vectype)
4678 tree vec_dest;
4679 const char *name;
4680 char *new_name;
4681 tree type;
4682 enum vect_var_kind kind;
4684 kind = vectype
4685 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4686 ? vect_mask_var
4687 : vect_simple_var
4688 : vect_scalar_var;
4689 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4691 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4693 name = get_name (scalar_dest);
4694 if (name)
4695 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4696 else
4697 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4698 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4699 free (new_name);
4701 return vec_dest;
4704 /* Function vect_grouped_store_supported.
4706 Returns TRUE if interleave high and interleave low permutations
4707 are supported, and FALSE otherwise. */
4709 bool
4710 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4712 machine_mode mode = TYPE_MODE (vectype);
4714 /* vect_permute_store_chain requires the group size to be equal to 3 or
4715 be a power of two. */
4716 if (count != 3 && exact_log2 (count) == -1)
4718 if (dump_enabled_p ())
4719 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4720 "the size of the group of accesses"
4721 " is not a power of 2 or not eqaul to 3\n");
4722 return false;
4725 /* Check that the permutation is supported. */
4726 if (VECTOR_MODE_P (mode))
4728 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4729 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4731 if (count == 3)
4733 unsigned int j0 = 0, j1 = 0, j2 = 0;
4734 unsigned int i, j;
4736 for (j = 0; j < 3; j++)
4738 int nelt0 = ((3 - j) * nelt) % 3;
4739 int nelt1 = ((3 - j) * nelt + 1) % 3;
4740 int nelt2 = ((3 - j) * nelt + 2) % 3;
4741 for (i = 0; i < nelt; i++)
4743 if (3 * i + nelt0 < nelt)
4744 sel[3 * i + nelt0] = j0++;
4745 if (3 * i + nelt1 < nelt)
4746 sel[3 * i + nelt1] = nelt + j1++;
4747 if (3 * i + nelt2 < nelt)
4748 sel[3 * i + nelt2] = 0;
4750 if (!can_vec_perm_p (mode, false, sel))
4752 if (dump_enabled_p ())
4753 dump_printf (MSG_MISSED_OPTIMIZATION,
4754 "permutaion op not supported by target.\n");
4755 return false;
4758 for (i = 0; i < nelt; i++)
4760 if (3 * i + nelt0 < nelt)
4761 sel[3 * i + nelt0] = 3 * i + nelt0;
4762 if (3 * i + nelt1 < nelt)
4763 sel[3 * i + nelt1] = 3 * i + nelt1;
4764 if (3 * i + nelt2 < nelt)
4765 sel[3 * i + nelt2] = nelt + j2++;
4767 if (!can_vec_perm_p (mode, false, sel))
4769 if (dump_enabled_p ())
4770 dump_printf (MSG_MISSED_OPTIMIZATION,
4771 "permutaion op not supported by target.\n");
4772 return false;
4775 return true;
4777 else
4779 /* If length is not equal to 3 then only power of 2 is supported. */
4780 gcc_assert (pow2p_hwi (count));
4782 for (i = 0; i < nelt / 2; i++)
4784 sel[i * 2] = i;
4785 sel[i * 2 + 1] = i + nelt;
4787 if (can_vec_perm_p (mode, false, sel))
4789 for (i = 0; i < nelt; i++)
4790 sel[i] += nelt / 2;
4791 if (can_vec_perm_p (mode, false, sel))
4792 return true;
4797 if (dump_enabled_p ())
4798 dump_printf (MSG_MISSED_OPTIMIZATION,
4799 "permutaion op not supported by target.\n");
4800 return false;
4804 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4805 type VECTYPE. */
4807 bool
4808 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4810 return vect_lanes_optab_supported_p ("vec_store_lanes",
4811 vec_store_lanes_optab,
4812 vectype, count);
4816 /* Function vect_permute_store_chain.
4818 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4819 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4820 the data correctly for the stores. Return the final references for stores
4821 in RESULT_CHAIN.
4823 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4824 The input is 4 vectors each containing 8 elements. We assign a number to
4825 each element, the input sequence is:
4827 1st vec: 0 1 2 3 4 5 6 7
4828 2nd vec: 8 9 10 11 12 13 14 15
4829 3rd vec: 16 17 18 19 20 21 22 23
4830 4th vec: 24 25 26 27 28 29 30 31
4832 The output sequence should be:
4834 1st vec: 0 8 16 24 1 9 17 25
4835 2nd vec: 2 10 18 26 3 11 19 27
4836 3rd vec: 4 12 20 28 5 13 21 30
4837 4th vec: 6 14 22 30 7 15 23 31
4839 i.e., we interleave the contents of the four vectors in their order.
4841 We use interleave_high/low instructions to create such output. The input of
4842 each interleave_high/low operation is two vectors:
4843 1st vec 2nd vec
4844 0 1 2 3 4 5 6 7
4845 the even elements of the result vector are obtained left-to-right from the
4846 high/low elements of the first vector. The odd elements of the result are
4847 obtained left-to-right from the high/low elements of the second vector.
4848 The output of interleave_high will be: 0 4 1 5
4849 and of interleave_low: 2 6 3 7
4852 The permutation is done in log LENGTH stages. In each stage interleave_high
4853 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4854 where the first argument is taken from the first half of DR_CHAIN and the
4855 second argument from it's second half.
4856 In our example,
4858 I1: interleave_high (1st vec, 3rd vec)
4859 I2: interleave_low (1st vec, 3rd vec)
4860 I3: interleave_high (2nd vec, 4th vec)
4861 I4: interleave_low (2nd vec, 4th vec)
4863 The output for the first stage is:
4865 I1: 0 16 1 17 2 18 3 19
4866 I2: 4 20 5 21 6 22 7 23
4867 I3: 8 24 9 25 10 26 11 27
4868 I4: 12 28 13 29 14 30 15 31
4870 The output of the second stage, i.e. the final result is:
4872 I1: 0 8 16 24 1 9 17 25
4873 I2: 2 10 18 26 3 11 19 27
4874 I3: 4 12 20 28 5 13 21 30
4875 I4: 6 14 22 30 7 15 23 31. */
4877 void
4878 vect_permute_store_chain (vec<tree> dr_chain,
4879 unsigned int length,
4880 gimple *stmt,
4881 gimple_stmt_iterator *gsi,
4882 vec<tree> *result_chain)
4884 tree vect1, vect2, high, low;
4885 gimple *perm_stmt;
4886 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4887 tree perm_mask_low, perm_mask_high;
4888 tree data_ref;
4889 tree perm3_mask_low, perm3_mask_high;
4890 unsigned int i, n, log_length = exact_log2 (length);
4891 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4892 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4894 result_chain->quick_grow (length);
4895 memcpy (result_chain->address (), dr_chain.address (),
4896 length * sizeof (tree));
4898 if (length == 3)
4900 unsigned int j0 = 0, j1 = 0, j2 = 0;
4902 for (j = 0; j < 3; j++)
4904 int nelt0 = ((3 - j) * nelt) % 3;
4905 int nelt1 = ((3 - j) * nelt + 1) % 3;
4906 int nelt2 = ((3 - j) * nelt + 2) % 3;
4908 for (i = 0; i < nelt; i++)
4910 if (3 * i + nelt0 < nelt)
4911 sel[3 * i + nelt0] = j0++;
4912 if (3 * i + nelt1 < nelt)
4913 sel[3 * i + nelt1] = nelt + j1++;
4914 if (3 * i + nelt2 < nelt)
4915 sel[3 * i + nelt2] = 0;
4917 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4919 for (i = 0; i < nelt; i++)
4921 if (3 * i + nelt0 < nelt)
4922 sel[3 * i + nelt0] = 3 * i + nelt0;
4923 if (3 * i + nelt1 < nelt)
4924 sel[3 * i + nelt1] = 3 * i + nelt1;
4925 if (3 * i + nelt2 < nelt)
4926 sel[3 * i + nelt2] = nelt + j2++;
4928 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4930 vect1 = dr_chain[0];
4931 vect2 = dr_chain[1];
4933 /* Create interleaving stmt:
4934 low = VEC_PERM_EXPR <vect1, vect2,
4935 {j, nelt, *, j + 1, nelt + j + 1, *,
4936 j + 2, nelt + j + 2, *, ...}> */
4937 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4938 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4939 vect2, perm3_mask_low);
4940 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4942 vect1 = data_ref;
4943 vect2 = dr_chain[2];
4944 /* Create interleaving stmt:
4945 low = VEC_PERM_EXPR <vect1, vect2,
4946 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4947 6, 7, nelt + j + 2, ...}> */
4948 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4949 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4950 vect2, perm3_mask_high);
4951 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4952 (*result_chain)[j] = data_ref;
4955 else
4957 /* If length is not equal to 3 then only power of 2 is supported. */
4958 gcc_assert (pow2p_hwi (length));
4960 for (i = 0, n = nelt / 2; i < n; i++)
4962 sel[i * 2] = i;
4963 sel[i * 2 + 1] = i + nelt;
4965 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4967 for (i = 0; i < nelt; i++)
4968 sel[i] += nelt / 2;
4969 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4971 for (i = 0, n = log_length; i < n; i++)
4973 for (j = 0; j < length/2; j++)
4975 vect1 = dr_chain[j];
4976 vect2 = dr_chain[j+length/2];
4978 /* Create interleaving stmt:
4979 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4980 ...}> */
4981 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4982 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4983 vect2, perm_mask_high);
4984 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4985 (*result_chain)[2*j] = high;
4987 /* Create interleaving stmt:
4988 low = VEC_PERM_EXPR <vect1, vect2,
4989 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4990 ...}> */
4991 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4992 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4993 vect2, perm_mask_low);
4994 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4995 (*result_chain)[2*j+1] = low;
4997 memcpy (dr_chain.address (), result_chain->address (),
4998 length * sizeof (tree));
5003 /* Function vect_setup_realignment
5005 This function is called when vectorizing an unaligned load using
5006 the dr_explicit_realign[_optimized] scheme.
5007 This function generates the following code at the loop prolog:
5009 p = initial_addr;
5010 x msq_init = *(floor(p)); # prolog load
5011 realignment_token = call target_builtin;
5012 loop:
5013 x msq = phi (msq_init, ---)
5015 The stmts marked with x are generated only for the case of
5016 dr_explicit_realign_optimized.
5018 The code above sets up a new (vector) pointer, pointing to the first
5019 location accessed by STMT, and a "floor-aligned" load using that pointer.
5020 It also generates code to compute the "realignment-token" (if the relevant
5021 target hook was defined), and creates a phi-node at the loop-header bb
5022 whose arguments are the result of the prolog-load (created by this
5023 function) and the result of a load that takes place in the loop (to be
5024 created by the caller to this function).
5026 For the case of dr_explicit_realign_optimized:
5027 The caller to this function uses the phi-result (msq) to create the
5028 realignment code inside the loop, and sets up the missing phi argument,
5029 as follows:
5030 loop:
5031 msq = phi (msq_init, lsq)
5032 lsq = *(floor(p')); # load in loop
5033 result = realign_load (msq, lsq, realignment_token);
5035 For the case of dr_explicit_realign:
5036 loop:
5037 msq = *(floor(p)); # load in loop
5038 p' = p + (VS-1);
5039 lsq = *(floor(p')); # load in loop
5040 result = realign_load (msq, lsq, realignment_token);
5042 Input:
5043 STMT - (scalar) load stmt to be vectorized. This load accesses
5044 a memory location that may be unaligned.
5045 BSI - place where new code is to be inserted.
5046 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5047 is used.
5049 Output:
5050 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5051 target hook, if defined.
5052 Return value - the result of the loop-header phi node. */
5054 tree
5055 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5056 tree *realignment_token,
5057 enum dr_alignment_support alignment_support_scheme,
5058 tree init_addr,
5059 struct loop **at_loop)
5061 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5062 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5063 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5064 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5065 struct loop *loop = NULL;
5066 edge pe = NULL;
5067 tree scalar_dest = gimple_assign_lhs (stmt);
5068 tree vec_dest;
5069 gimple *inc;
5070 tree ptr;
5071 tree data_ref;
5072 basic_block new_bb;
5073 tree msq_init = NULL_TREE;
5074 tree new_temp;
5075 gphi *phi_stmt;
5076 tree msq = NULL_TREE;
5077 gimple_seq stmts = NULL;
5078 bool inv_p;
5079 bool compute_in_loop = false;
5080 bool nested_in_vect_loop = false;
5081 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5082 struct loop *loop_for_initial_load = NULL;
5084 if (loop_vinfo)
5086 loop = LOOP_VINFO_LOOP (loop_vinfo);
5087 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5090 gcc_assert (alignment_support_scheme == dr_explicit_realign
5091 || alignment_support_scheme == dr_explicit_realign_optimized);
5093 /* We need to generate three things:
5094 1. the misalignment computation
5095 2. the extra vector load (for the optimized realignment scheme).
5096 3. the phi node for the two vectors from which the realignment is
5097 done (for the optimized realignment scheme). */
5099 /* 1. Determine where to generate the misalignment computation.
5101 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5102 calculation will be generated by this function, outside the loop (in the
5103 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5104 caller, inside the loop.
5106 Background: If the misalignment remains fixed throughout the iterations of
5107 the loop, then both realignment schemes are applicable, and also the
5108 misalignment computation can be done outside LOOP. This is because we are
5109 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5110 are a multiple of VS (the Vector Size), and therefore the misalignment in
5111 different vectorized LOOP iterations is always the same.
5112 The problem arises only if the memory access is in an inner-loop nested
5113 inside LOOP, which is now being vectorized using outer-loop vectorization.
5114 This is the only case when the misalignment of the memory access may not
5115 remain fixed throughout the iterations of the inner-loop (as explained in
5116 detail in vect_supportable_dr_alignment). In this case, not only is the
5117 optimized realignment scheme not applicable, but also the misalignment
5118 computation (and generation of the realignment token that is passed to
5119 REALIGN_LOAD) have to be done inside the loop.
5121 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5122 or not, which in turn determines if the misalignment is computed inside
5123 the inner-loop, or outside LOOP. */
5125 if (init_addr != NULL_TREE || !loop_vinfo)
5127 compute_in_loop = true;
5128 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5132 /* 2. Determine where to generate the extra vector load.
5134 For the optimized realignment scheme, instead of generating two vector
5135 loads in each iteration, we generate a single extra vector load in the
5136 preheader of the loop, and in each iteration reuse the result of the
5137 vector load from the previous iteration. In case the memory access is in
5138 an inner-loop nested inside LOOP, which is now being vectorized using
5139 outer-loop vectorization, we need to determine whether this initial vector
5140 load should be generated at the preheader of the inner-loop, or can be
5141 generated at the preheader of LOOP. If the memory access has no evolution
5142 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5143 to be generated inside LOOP (in the preheader of the inner-loop). */
5145 if (nested_in_vect_loop)
5147 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5148 bool invariant_in_outerloop =
5149 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5150 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5152 else
5153 loop_for_initial_load = loop;
5154 if (at_loop)
5155 *at_loop = loop_for_initial_load;
5157 if (loop_for_initial_load)
5158 pe = loop_preheader_edge (loop_for_initial_load);
5160 /* 3. For the case of the optimized realignment, create the first vector
5161 load at the loop preheader. */
5163 if (alignment_support_scheme == dr_explicit_realign_optimized)
5165 /* Create msq_init = *(floor(p1)) in the loop preheader */
5166 gassign *new_stmt;
5168 gcc_assert (!compute_in_loop);
5169 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5170 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5171 NULL_TREE, &init_addr, NULL, &inc,
5172 true, &inv_p);
5173 if (TREE_CODE (ptr) == SSA_NAME)
5174 new_temp = copy_ssa_name (ptr);
5175 else
5176 new_temp = make_ssa_name (TREE_TYPE (ptr));
5177 new_stmt = gimple_build_assign
5178 (new_temp, BIT_AND_EXPR, ptr,
5179 build_int_cst (TREE_TYPE (ptr),
5180 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
5181 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5182 gcc_assert (!new_bb);
5183 data_ref
5184 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5185 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5186 new_stmt = gimple_build_assign (vec_dest, data_ref);
5187 new_temp = make_ssa_name (vec_dest, new_stmt);
5188 gimple_assign_set_lhs (new_stmt, new_temp);
5189 if (pe)
5191 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5192 gcc_assert (!new_bb);
5194 else
5195 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5197 msq_init = gimple_assign_lhs (new_stmt);
5200 /* 4. Create realignment token using a target builtin, if available.
5201 It is done either inside the containing loop, or before LOOP (as
5202 determined above). */
5204 if (targetm.vectorize.builtin_mask_for_load)
5206 gcall *new_stmt;
5207 tree builtin_decl;
5209 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5210 if (!init_addr)
5212 /* Generate the INIT_ADDR computation outside LOOP. */
5213 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5214 NULL_TREE, loop);
5215 if (loop)
5217 pe = loop_preheader_edge (loop);
5218 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5219 gcc_assert (!new_bb);
5221 else
5222 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5225 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5226 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5227 vec_dest =
5228 vect_create_destination_var (scalar_dest,
5229 gimple_call_return_type (new_stmt));
5230 new_temp = make_ssa_name (vec_dest, new_stmt);
5231 gimple_call_set_lhs (new_stmt, new_temp);
5233 if (compute_in_loop)
5234 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5235 else
5237 /* Generate the misalignment computation outside LOOP. */
5238 pe = loop_preheader_edge (loop);
5239 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5240 gcc_assert (!new_bb);
5243 *realignment_token = gimple_call_lhs (new_stmt);
5245 /* The result of the CALL_EXPR to this builtin is determined from
5246 the value of the parameter and no global variables are touched
5247 which makes the builtin a "const" function. Requiring the
5248 builtin to have the "const" attribute makes it unnecessary
5249 to call mark_call_clobbered. */
5250 gcc_assert (TREE_READONLY (builtin_decl));
5253 if (alignment_support_scheme == dr_explicit_realign)
5254 return msq;
5256 gcc_assert (!compute_in_loop);
5257 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5260 /* 5. Create msq = phi <msq_init, lsq> in loop */
5262 pe = loop_preheader_edge (containing_loop);
5263 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5264 msq = make_ssa_name (vec_dest);
5265 phi_stmt = create_phi_node (msq, containing_loop->header);
5266 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5268 return msq;
5272 /* Function vect_grouped_load_supported.
5274 COUNT is the size of the load group (the number of statements plus the
5275 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5276 only one statement, with a gap of COUNT - 1.
5278 Returns true if a suitable permute exists. */
5280 bool
5281 vect_grouped_load_supported (tree vectype, bool single_element_p,
5282 unsigned HOST_WIDE_INT count)
5284 machine_mode mode = TYPE_MODE (vectype);
5286 /* If this is single-element interleaving with an element distance
5287 that leaves unused vector loads around punt - we at least create
5288 very sub-optimal code in that case (and blow up memory,
5289 see PR65518). */
5290 if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
5292 if (dump_enabled_p ())
5293 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5294 "single-element interleaving not supported "
5295 "for not adjacent vector loads\n");
5296 return false;
5299 /* vect_permute_load_chain requires the group size to be equal to 3 or
5300 be a power of two. */
5301 if (count != 3 && exact_log2 (count) == -1)
5303 if (dump_enabled_p ())
5304 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5305 "the size of the group of accesses"
5306 " is not a power of 2 or not equal to 3\n");
5307 return false;
5310 /* Check that the permutation is supported. */
5311 if (VECTOR_MODE_P (mode))
5313 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5314 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5316 if (count == 3)
5318 unsigned int k;
5319 for (k = 0; k < 3; k++)
5321 for (i = 0; i < nelt; i++)
5322 if (3 * i + k < 2 * nelt)
5323 sel[i] = 3 * i + k;
5324 else
5325 sel[i] = 0;
5326 if (!can_vec_perm_p (mode, false, sel))
5328 if (dump_enabled_p ())
5329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5330 "shuffle of 3 loads is not supported by"
5331 " target\n");
5332 return false;
5334 for (i = 0, j = 0; i < nelt; i++)
5335 if (3 * i + k < 2 * nelt)
5336 sel[i] = i;
5337 else
5338 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5339 if (!can_vec_perm_p (mode, false, sel))
5341 if (dump_enabled_p ())
5342 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5343 "shuffle of 3 loads is not supported by"
5344 " target\n");
5345 return false;
5348 return true;
5350 else
5352 /* If length is not equal to 3 then only power of 2 is supported. */
5353 gcc_assert (pow2p_hwi (count));
5354 for (i = 0; i < nelt; i++)
5355 sel[i] = i * 2;
5356 if (can_vec_perm_p (mode, false, sel))
5358 for (i = 0; i < nelt; i++)
5359 sel[i] = i * 2 + 1;
5360 if (can_vec_perm_p (mode, false, sel))
5361 return true;
5366 if (dump_enabled_p ())
5367 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5368 "extract even/odd not supported by target\n");
5369 return false;
5372 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5373 type VECTYPE. */
5375 bool
5376 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5378 return vect_lanes_optab_supported_p ("vec_load_lanes",
5379 vec_load_lanes_optab,
5380 vectype, count);
5383 /* Function vect_permute_load_chain.
5385 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5386 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5387 the input data correctly. Return the final references for loads in
5388 RESULT_CHAIN.
5390 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5391 The input is 4 vectors each containing 8 elements. We assign a number to each
5392 element, the input sequence is:
5394 1st vec: 0 1 2 3 4 5 6 7
5395 2nd vec: 8 9 10 11 12 13 14 15
5396 3rd vec: 16 17 18 19 20 21 22 23
5397 4th vec: 24 25 26 27 28 29 30 31
5399 The output sequence should be:
5401 1st vec: 0 4 8 12 16 20 24 28
5402 2nd vec: 1 5 9 13 17 21 25 29
5403 3rd vec: 2 6 10 14 18 22 26 30
5404 4th vec: 3 7 11 15 19 23 27 31
5406 i.e., the first output vector should contain the first elements of each
5407 interleaving group, etc.
5409 We use extract_even/odd instructions to create such output. The input of
5410 each extract_even/odd operation is two vectors
5411 1st vec 2nd vec
5412 0 1 2 3 4 5 6 7
5414 and the output is the vector of extracted even/odd elements. The output of
5415 extract_even will be: 0 2 4 6
5416 and of extract_odd: 1 3 5 7
5419 The permutation is done in log LENGTH stages. In each stage extract_even
5420 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5421 their order. In our example,
5423 E1: extract_even (1st vec, 2nd vec)
5424 E2: extract_odd (1st vec, 2nd vec)
5425 E3: extract_even (3rd vec, 4th vec)
5426 E4: extract_odd (3rd vec, 4th vec)
5428 The output for the first stage will be:
5430 E1: 0 2 4 6 8 10 12 14
5431 E2: 1 3 5 7 9 11 13 15
5432 E3: 16 18 20 22 24 26 28 30
5433 E4: 17 19 21 23 25 27 29 31
5435 In order to proceed and create the correct sequence for the next stage (or
5436 for the correct output, if the second stage is the last one, as in our
5437 example), we first put the output of extract_even operation and then the
5438 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5439 The input for the second stage is:
5441 1st vec (E1): 0 2 4 6 8 10 12 14
5442 2nd vec (E3): 16 18 20 22 24 26 28 30
5443 3rd vec (E2): 1 3 5 7 9 11 13 15
5444 4th vec (E4): 17 19 21 23 25 27 29 31
5446 The output of the second stage:
5448 E1: 0 4 8 12 16 20 24 28
5449 E2: 2 6 10 14 18 22 26 30
5450 E3: 1 5 9 13 17 21 25 29
5451 E4: 3 7 11 15 19 23 27 31
5453 And RESULT_CHAIN after reordering:
5455 1st vec (E1): 0 4 8 12 16 20 24 28
5456 2nd vec (E3): 1 5 9 13 17 21 25 29
5457 3rd vec (E2): 2 6 10 14 18 22 26 30
5458 4th vec (E4): 3 7 11 15 19 23 27 31. */
5460 static void
5461 vect_permute_load_chain (vec<tree> dr_chain,
5462 unsigned int length,
5463 gimple *stmt,
5464 gimple_stmt_iterator *gsi,
5465 vec<tree> *result_chain)
5467 tree data_ref, first_vect, second_vect;
5468 tree perm_mask_even, perm_mask_odd;
5469 tree perm3_mask_low, perm3_mask_high;
5470 gimple *perm_stmt;
5471 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5472 unsigned int i, j, log_length = exact_log2 (length);
5473 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5474 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5476 result_chain->quick_grow (length);
5477 memcpy (result_chain->address (), dr_chain.address (),
5478 length * sizeof (tree));
5480 if (length == 3)
5482 unsigned int k;
5484 for (k = 0; k < 3; k++)
5486 for (i = 0; i < nelt; i++)
5487 if (3 * i + k < 2 * nelt)
5488 sel[i] = 3 * i + k;
5489 else
5490 sel[i] = 0;
5491 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5493 for (i = 0, j = 0; i < nelt; i++)
5494 if (3 * i + k < 2 * nelt)
5495 sel[i] = i;
5496 else
5497 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5499 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5501 first_vect = dr_chain[0];
5502 second_vect = dr_chain[1];
5504 /* Create interleaving stmt (low part of):
5505 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5506 ...}> */
5507 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5508 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5509 second_vect, perm3_mask_low);
5510 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5512 /* Create interleaving stmt (high part of):
5513 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5514 ...}> */
5515 first_vect = data_ref;
5516 second_vect = dr_chain[2];
5517 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5518 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5519 second_vect, perm3_mask_high);
5520 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5521 (*result_chain)[k] = data_ref;
5524 else
5526 /* If length is not equal to 3 then only power of 2 is supported. */
5527 gcc_assert (pow2p_hwi (length));
5529 for (i = 0; i < nelt; ++i)
5530 sel[i] = i * 2;
5531 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5533 for (i = 0; i < nelt; ++i)
5534 sel[i] = i * 2 + 1;
5535 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5537 for (i = 0; i < log_length; i++)
5539 for (j = 0; j < length; j += 2)
5541 first_vect = dr_chain[j];
5542 second_vect = dr_chain[j+1];
5544 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5545 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5546 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5547 first_vect, second_vect,
5548 perm_mask_even);
5549 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5550 (*result_chain)[j/2] = data_ref;
5552 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5553 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5554 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5555 first_vect, second_vect,
5556 perm_mask_odd);
5557 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5558 (*result_chain)[j/2+length/2] = data_ref;
5560 memcpy (dr_chain.address (), result_chain->address (),
5561 length * sizeof (tree));
5566 /* Function vect_shift_permute_load_chain.
5568 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5569 sequence of stmts to reorder the input data accordingly.
5570 Return the final references for loads in RESULT_CHAIN.
5571 Return true if successed, false otherwise.
5573 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5574 The input is 3 vectors each containing 8 elements. We assign a
5575 number to each element, the input sequence is:
5577 1st vec: 0 1 2 3 4 5 6 7
5578 2nd vec: 8 9 10 11 12 13 14 15
5579 3rd vec: 16 17 18 19 20 21 22 23
5581 The output sequence should be:
5583 1st vec: 0 3 6 9 12 15 18 21
5584 2nd vec: 1 4 7 10 13 16 19 22
5585 3rd vec: 2 5 8 11 14 17 20 23
5587 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5589 First we shuffle all 3 vectors to get correct elements order:
5591 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5592 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5593 3rd vec: (16 19 22) (17 20 23) (18 21)
5595 Next we unite and shift vector 3 times:
5597 1st step:
5598 shift right by 6 the concatenation of:
5599 "1st vec" and "2nd vec"
5600 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5601 "2nd vec" and "3rd vec"
5602 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5603 "3rd vec" and "1st vec"
5604 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5605 | New vectors |
5607 So that now new vectors are:
5609 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5610 2nd vec: (10 13) (16 19 22) (17 20 23)
5611 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5613 2nd step:
5614 shift right by 5 the concatenation of:
5615 "1st vec" and "3rd vec"
5616 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5617 "2nd vec" and "1st vec"
5618 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5619 "3rd vec" and "2nd vec"
5620 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5621 | New vectors |
5623 So that now new vectors are:
5625 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5626 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5627 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5629 3rd step:
5630 shift right by 5 the concatenation of:
5631 "1st vec" and "1st vec"
5632 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5633 shift right by 3 the concatenation of:
5634 "2nd vec" and "2nd vec"
5635 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5636 | New vectors |
5638 So that now all vectors are READY:
5639 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5640 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5641 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5643 This algorithm is faster than one in vect_permute_load_chain if:
5644 1. "shift of a concatination" is faster than general permutation.
5645 This is usually so.
5646 2. The TARGET machine can't execute vector instructions in parallel.
5647 This is because each step of the algorithm depends on previous.
5648 The algorithm in vect_permute_load_chain is much more parallel.
5650 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5653 static bool
5654 vect_shift_permute_load_chain (vec<tree> dr_chain,
5655 unsigned int length,
5656 gimple *stmt,
5657 gimple_stmt_iterator *gsi,
5658 vec<tree> *result_chain)
5660 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5661 tree perm2_mask1, perm2_mask2, perm3_mask;
5662 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5663 gimple *perm_stmt;
5665 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5666 unsigned int i;
5667 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5668 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5669 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5670 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5672 result_chain->quick_grow (length);
5673 memcpy (result_chain->address (), dr_chain.address (),
5674 length * sizeof (tree));
5676 if (pow2p_hwi (length) && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5678 unsigned int j, log_length = exact_log2 (length);
5679 for (i = 0; i < nelt / 2; ++i)
5680 sel[i] = i * 2;
5681 for (i = 0; i < nelt / 2; ++i)
5682 sel[nelt / 2 + i] = i * 2 + 1;
5683 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5685 if (dump_enabled_p ())
5686 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5687 "shuffle of 2 fields structure is not \
5688 supported by target\n");
5689 return false;
5691 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5693 for (i = 0; i < nelt / 2; ++i)
5694 sel[i] = i * 2 + 1;
5695 for (i = 0; i < nelt / 2; ++i)
5696 sel[nelt / 2 + i] = i * 2;
5697 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5699 if (dump_enabled_p ())
5700 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5701 "shuffle of 2 fields structure is not \
5702 supported by target\n");
5703 return false;
5705 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5707 /* Generating permutation constant to shift all elements.
5708 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5709 for (i = 0; i < nelt; i++)
5710 sel[i] = nelt / 2 + i;
5711 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5713 if (dump_enabled_p ())
5714 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5715 "shift permutation is not supported by target\n");
5716 return false;
5718 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5720 /* Generating permutation constant to select vector from 2.
5721 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5722 for (i = 0; i < nelt / 2; i++)
5723 sel[i] = i;
5724 for (i = nelt / 2; i < nelt; i++)
5725 sel[i] = nelt + i;
5726 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5728 if (dump_enabled_p ())
5729 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5730 "select is not supported by target\n");
5731 return false;
5733 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5735 for (i = 0; i < log_length; i++)
5737 for (j = 0; j < length; j += 2)
5739 first_vect = dr_chain[j];
5740 second_vect = dr_chain[j + 1];
5742 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5743 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5744 first_vect, first_vect,
5745 perm2_mask1);
5746 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5747 vect[0] = data_ref;
5749 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5750 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5751 second_vect, second_vect,
5752 perm2_mask2);
5753 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5754 vect[1] = data_ref;
5756 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5757 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5758 vect[0], vect[1], shift1_mask);
5759 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5760 (*result_chain)[j/2 + length/2] = data_ref;
5762 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5763 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5764 vect[0], vect[1], select_mask);
5765 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5766 (*result_chain)[j/2] = data_ref;
5768 memcpy (dr_chain.address (), result_chain->address (),
5769 length * sizeof (tree));
5771 return true;
5773 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5775 unsigned int k = 0, l = 0;
5777 /* Generating permutation constant to get all elements in rigth order.
5778 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5779 for (i = 0; i < nelt; i++)
5781 if (3 * k + (l % 3) >= nelt)
5783 k = 0;
5784 l += (3 - (nelt % 3));
5786 sel[i] = 3 * k + (l % 3);
5787 k++;
5789 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5791 if (dump_enabled_p ())
5792 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5793 "shuffle of 3 fields structure is not \
5794 supported by target\n");
5795 return false;
5797 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5799 /* Generating permutation constant to shift all elements.
5800 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5801 for (i = 0; i < nelt; i++)
5802 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5803 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5805 if (dump_enabled_p ())
5806 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5807 "shift permutation is not supported by target\n");
5808 return false;
5810 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5812 /* Generating permutation constant to shift all elements.
5813 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5814 for (i = 0; i < nelt; i++)
5815 sel[i] = 2 * (nelt / 3) + 1 + i;
5816 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5818 if (dump_enabled_p ())
5819 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5820 "shift permutation is not supported by target\n");
5821 return false;
5823 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5825 /* Generating permutation constant to shift all elements.
5826 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5827 for (i = 0; i < nelt; i++)
5828 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5829 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5831 if (dump_enabled_p ())
5832 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5833 "shift permutation is not supported by target\n");
5834 return false;
5836 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5838 /* Generating permutation constant to shift all elements.
5839 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5840 for (i = 0; i < nelt; i++)
5841 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5842 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5844 if (dump_enabled_p ())
5845 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5846 "shift permutation is not supported by target\n");
5847 return false;
5849 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5851 for (k = 0; k < 3; k++)
5853 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5854 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5855 dr_chain[k], dr_chain[k],
5856 perm3_mask);
5857 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5858 vect[k] = data_ref;
5861 for (k = 0; k < 3; k++)
5863 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5864 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5865 vect[k % 3], vect[(k + 1) % 3],
5866 shift1_mask);
5867 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5868 vect_shift[k] = data_ref;
5871 for (k = 0; k < 3; k++)
5873 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5874 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5875 vect_shift[(4 - k) % 3],
5876 vect_shift[(3 - k) % 3],
5877 shift2_mask);
5878 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5879 vect[k] = data_ref;
5882 (*result_chain)[3 - (nelt % 3)] = vect[2];
5884 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5885 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5886 vect[0], shift3_mask);
5887 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5888 (*result_chain)[nelt % 3] = data_ref;
5890 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5891 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5892 vect[1], shift4_mask);
5893 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5894 (*result_chain)[0] = data_ref;
5895 return true;
5897 return false;
5900 /* Function vect_transform_grouped_load.
5902 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5903 to perform their permutation and ascribe the result vectorized statements to
5904 the scalar statements.
5907 void
5908 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5909 gimple_stmt_iterator *gsi)
5911 machine_mode mode;
5912 vec<tree> result_chain = vNULL;
5914 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5915 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5916 vectors, that are ready for vector computation. */
5917 result_chain.create (size);
5919 /* If reassociation width for vector type is 2 or greater target machine can
5920 execute 2 or more vector instructions in parallel. Otherwise try to
5921 get chain for loads group using vect_shift_permute_load_chain. */
5922 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5923 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5924 || pow2p_hwi (size)
5925 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5926 gsi, &result_chain))
5927 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5928 vect_record_grouped_load_vectors (stmt, result_chain);
5929 result_chain.release ();
5932 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5933 generated as part of the vectorization of STMT. Assign the statement
5934 for each vector to the associated scalar statement. */
5936 void
5937 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5939 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5940 gimple *next_stmt, *new_stmt;
5941 unsigned int i, gap_count;
5942 tree tmp_data_ref;
5944 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5945 Since we scan the chain starting from it's first node, their order
5946 corresponds the order of data-refs in RESULT_CHAIN. */
5947 next_stmt = first_stmt;
5948 gap_count = 1;
5949 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5951 if (!next_stmt)
5952 break;
5954 /* Skip the gaps. Loads created for the gaps will be removed by dead
5955 code elimination pass later. No need to check for the first stmt in
5956 the group, since it always exists.
5957 GROUP_GAP is the number of steps in elements from the previous
5958 access (if there is no gap GROUP_GAP is 1). We skip loads that
5959 correspond to the gaps. */
5960 if (next_stmt != first_stmt
5961 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5963 gap_count++;
5964 continue;
5967 while (next_stmt)
5969 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5970 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5971 copies, and we put the new vector statement in the first available
5972 RELATED_STMT. */
5973 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5974 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5975 else
5977 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5979 gimple *prev_stmt =
5980 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5981 gimple *rel_stmt =
5982 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5983 while (rel_stmt)
5985 prev_stmt = rel_stmt;
5986 rel_stmt =
5987 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5990 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5991 new_stmt;
5995 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5996 gap_count = 1;
5997 /* If NEXT_STMT accesses the same DR as the previous statement,
5998 put the same TMP_DATA_REF as its vectorized statement; otherwise
5999 get the next data-ref from RESULT_CHAIN. */
6000 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6001 break;
6006 /* Function vect_force_dr_alignment_p.
6008 Returns whether the alignment of a DECL can be forced to be aligned
6009 on ALIGNMENT bit boundary. */
6011 bool
6012 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6014 if (!VAR_P (decl))
6015 return false;
6017 if (decl_in_symtab_p (decl)
6018 && !symtab_node::get (decl)->can_increase_alignment_p ())
6019 return false;
6021 if (TREE_STATIC (decl))
6022 return (alignment <= MAX_OFILE_ALIGNMENT);
6023 else
6024 return (alignment <= MAX_STACK_ALIGNMENT);
6028 /* Return whether the data reference DR is supported with respect to its
6029 alignment.
6030 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6031 it is aligned, i.e., check if it is possible to vectorize it with different
6032 alignment. */
6034 enum dr_alignment_support
6035 vect_supportable_dr_alignment (struct data_reference *dr,
6036 bool check_aligned_accesses)
6038 gimple *stmt = DR_STMT (dr);
6039 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6040 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6041 machine_mode mode = TYPE_MODE (vectype);
6042 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6043 struct loop *vect_loop = NULL;
6044 bool nested_in_vect_loop = false;
6046 if (aligned_access_p (dr) && !check_aligned_accesses)
6047 return dr_aligned;
6049 /* For now assume all conditional loads/stores support unaligned
6050 access without any special code. */
6051 if (is_gimple_call (stmt)
6052 && gimple_call_internal_p (stmt)
6053 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6054 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6055 return dr_unaligned_supported;
6057 if (loop_vinfo)
6059 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6060 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6063 /* Possibly unaligned access. */
6065 /* We can choose between using the implicit realignment scheme (generating
6066 a misaligned_move stmt) and the explicit realignment scheme (generating
6067 aligned loads with a REALIGN_LOAD). There are two variants to the
6068 explicit realignment scheme: optimized, and unoptimized.
6069 We can optimize the realignment only if the step between consecutive
6070 vector loads is equal to the vector size. Since the vector memory
6071 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6072 is guaranteed that the misalignment amount remains the same throughout the
6073 execution of the vectorized loop. Therefore, we can create the
6074 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6075 at the loop preheader.
6077 However, in the case of outer-loop vectorization, when vectorizing a
6078 memory access in the inner-loop nested within the LOOP that is now being
6079 vectorized, while it is guaranteed that the misalignment of the
6080 vectorized memory access will remain the same in different outer-loop
6081 iterations, it is *not* guaranteed that is will remain the same throughout
6082 the execution of the inner-loop. This is because the inner-loop advances
6083 with the original scalar step (and not in steps of VS). If the inner-loop
6084 step happens to be a multiple of VS, then the misalignment remains fixed
6085 and we can use the optimized realignment scheme. For example:
6087 for (i=0; i<N; i++)
6088 for (j=0; j<M; j++)
6089 s += a[i+j];
6091 When vectorizing the i-loop in the above example, the step between
6092 consecutive vector loads is 1, and so the misalignment does not remain
6093 fixed across the execution of the inner-loop, and the realignment cannot
6094 be optimized (as illustrated in the following pseudo vectorized loop):
6096 for (i=0; i<N; i+=4)
6097 for (j=0; j<M; j++){
6098 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6099 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6100 // (assuming that we start from an aligned address).
6103 We therefore have to use the unoptimized realignment scheme:
6105 for (i=0; i<N; i+=4)
6106 for (j=k; j<M; j+=4)
6107 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6108 // that the misalignment of the initial address is
6109 // 0).
6111 The loop can then be vectorized as follows:
6113 for (k=0; k<4; k++){
6114 rt = get_realignment_token (&vp[k]);
6115 for (i=0; i<N; i+=4){
6116 v1 = vp[i+k];
6117 for (j=k; j<M; j+=4){
6118 v2 = vp[i+j+VS-1];
6119 va = REALIGN_LOAD <v1,v2,rt>;
6120 vs += va;
6121 v1 = v2;
6124 } */
6126 if (DR_IS_READ (dr))
6128 bool is_packed = false;
6129 tree type = (TREE_TYPE (DR_REF (dr)));
6131 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6132 && (!targetm.vectorize.builtin_mask_for_load
6133 || targetm.vectorize.builtin_mask_for_load ()))
6135 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6137 /* If we are doing SLP then the accesses need not have the
6138 same alignment, instead it depends on the SLP group size. */
6139 if (loop_vinfo
6140 && STMT_SLP_TYPE (stmt_info)
6141 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6142 * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
6143 % TYPE_VECTOR_SUBPARTS (vectype) != 0))
6145 else if (!loop_vinfo
6146 || (nested_in_vect_loop
6147 && (TREE_INT_CST_LOW (DR_STEP (dr))
6148 != GET_MODE_SIZE (TYPE_MODE (vectype)))))
6149 return dr_explicit_realign;
6150 else
6151 return dr_explicit_realign_optimized;
6153 if (!known_alignment_for_access_p (dr))
6154 is_packed = not_size_aligned (DR_REF (dr));
6156 if ((TYPE_USER_ALIGN (type) && !is_packed)
6157 || targetm.vectorize.
6158 support_vector_misalignment (mode, type,
6159 DR_MISALIGNMENT (dr), is_packed))
6160 /* Can't software pipeline the loads, but can at least do them. */
6161 return dr_unaligned_supported;
6163 else
6165 bool is_packed = false;
6166 tree type = (TREE_TYPE (DR_REF (dr)));
6168 if (!known_alignment_for_access_p (dr))
6169 is_packed = not_size_aligned (DR_REF (dr));
6171 if ((TYPE_USER_ALIGN (type) && !is_packed)
6172 || targetm.vectorize.
6173 support_vector_misalignment (mode, type,
6174 DR_MISALIGNMENT (dr), is_packed))
6175 return dr_unaligned_supported;
6178 /* Unsupported. */
6179 return dr_unaligned_unsupported;