2015-12-01 Richard Biener <rguenther@suse.de>
gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "tm_p.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "cgraph.h"
35 #include "dumpfile.h"
36 #include "alias.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "tree-eh.h"
40 #include "gimplify.h"
41 #include "gimple-iterator.h"
42 #include "gimplify-me.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "tree-ssa-loop-manip.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "expr.h"
50 #include "builtins.h"
51 #include "params.h"
53 /* Return true if load- or store-lanes optab OPTAB is implemented for
54 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
56 static bool
57 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
58 tree vectype, unsigned HOST_WIDE_INT count)
60 machine_mode mode, array_mode;
61 bool limit_p;
63 mode = TYPE_MODE (vectype);
64 limit_p = !targetm.array_mode_supported_p (mode, count);
65 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
66 MODE_INT, limit_p);
68 if (array_mode == BLKmode)
70 if (dump_enabled_p ())
71 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
72 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
73 GET_MODE_NAME (mode), count);
74 return false;
77 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
79 if (dump_enabled_p ())
80 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
81 "cannot use %s<%s><%s>\n", name,
82 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
83 return false;
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_NOTE, vect_location,
88 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
89 GET_MODE_NAME (mode));
91 return true;
95 /* Return the smallest scalar part of STMT.
96 This is used to determine the vectype of the stmt. We generally set the
97 vectype according to the type of the result (lhs). For stmts whose
98 result-type is different than the type of the arguments (e.g., demotion,
99 promotion), vectype will be reset appropriately (later). Note that we have
100 to visit the smallest datatype in this function, because that determines the
101 VF. If the smallest datatype in the loop is present only as the rhs of a
102 promotion operation, we would otherwise miss it.
103 Such a case, where a variable of this datatype does not appear in the lhs
104 anywhere in the loop, can only occur if it's an invariant: e.g.:
105 'int_x = (int) short_inv', which we'd expect to have been optimized away by
106 invariant motion. However, we cannot rely on invariant motion to always
107 take invariants out of the loop, and so in the case of promotion we also
108 have to check the rhs.
109 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
110 types. */
112 tree
113 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
114 HOST_WIDE_INT *rhs_size_unit)
116 tree scalar_type = gimple_expr_type (stmt);
117 HOST_WIDE_INT lhs, rhs;
119 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
121 if (is_gimple_assign (stmt)
122 && (gimple_assign_cast_p (stmt)
123 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
124 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
125 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
127 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
129 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
130 if (rhs < lhs)
131 scalar_type = rhs_type;
134 *lhs_size_unit = lhs;
135 *rhs_size_unit = rhs;
136 return scalar_type;
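/* For illustration, in a loop such as

     short *s; int *d;
     for (i = 0; i < n; i++)
       d[i] = (int) s[i];

   the statement is a promotion, so the smallest scalar type returned is
   'short' (taken from the rhs); assuming 16-byte vectors this gives a
   vectorization factor of 8 rather than 4, and *LHS_SIZE_UNIT and
   *RHS_SIZE_UNIT are set to 4 and 2 respectively.  */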
140 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
141 tested at run-time. Return TRUE if DDR was successfully inserted.
142 Return false if versioning is not supported. */
144 static bool
145 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
147 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
149 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
150 return false;
152 if (dump_enabled_p ())
154 dump_printf_loc (MSG_NOTE, vect_location,
155 "mark for run-time aliasing test between ");
156 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
157 dump_printf (MSG_NOTE, " and ");
158 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
159 dump_printf (MSG_NOTE, "\n");
162 if (optimize_loop_nest_for_size_p (loop))
164 if (dump_enabled_p ())
165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
166 "versioning not supported when optimizing"
167 " for size.\n");
168 return false;
171 /* FORNOW: We don't support versioning with outer-loop vectorization. */
172 if (loop->inner)
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
176 "versioning not yet supported for outer-loops.\n");
177 return false;
180 /* FORNOW: We don't support creating runtime alias tests for non-constant
181 step. */
182 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
183 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
185 if (dump_enabled_p ())
186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
187 "versioning not yet supported for non-constant "
188 "step\n");
189 return false;
192 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
193 return true;
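/* For illustration, when such a DDR is versioned the run-time test that is
   eventually emitted is conceptually a check that the two accessed segments
   do not overlap, along the lines of

     if (a + segment_len_a <= b || b + segment_len_b <= a)
       ... vectorized loop ...
     else
       ... scalar loop ...

   where the segment lengths are derived from the (constant) steps and the
   number of iterations; this is only a sketch of the condition the
   versioning code builds later.  */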
197 /* Function vect_analyze_data_ref_dependence.
199 Return TRUE if there (might) exist a dependence between a memory-reference
200 DRA and a memory-reference DRB. When versioning for alias can handle the
201 dependence with a run-time check, return FALSE. Adjust *MAX_VF according to
202 the data dependence. */
204 static bool
205 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
206 loop_vec_info loop_vinfo, int *max_vf)
208 unsigned int i;
209 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
210 struct data_reference *dra = DDR_A (ddr);
211 struct data_reference *drb = DDR_B (ddr);
212 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
213 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
214 lambda_vector dist_v;
215 unsigned int loop_depth;
217 /* In loop analysis all data references should be vectorizable. */
218 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
219 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
220 gcc_unreachable ();
222 /* Independent data accesses. */
223 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
224 return false;
226 if (dra == drb
227 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
228 return false;
230 /* Even if we have an anti-dependence then, as the vectorized loop covers at
231 least two scalar iterations, there is always also a true dependence.
232 As the vectorizer does not re-order loads and stores we can ignore
233 the anti-dependence if TBAA can disambiguate both DRs similar to the
234 case with known negative distance anti-dependences (positive
235 distance anti-dependences would violate TBAA constraints). */
236 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
237 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
238 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
239 get_alias_set (DR_REF (drb))))
240 return false;
242 /* Unknown data dependence. */
243 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
245 /* If user asserted safelen consecutive iterations can be
246 executed concurrently, assume independence. */
247 if (loop->safelen >= 2)
249 if (loop->safelen < *max_vf)
250 *max_vf = loop->safelen;
251 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
252 return false;
255 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
256 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
258 if (dump_enabled_p ())
260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
261 "versioning for alias not supported for: "
262 "can't determine dependence between ");
263 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
264 DR_REF (dra));
265 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
266 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
267 DR_REF (drb));
268 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
270 return true;
273 if (dump_enabled_p ())
275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
276 "versioning for alias required: "
277 "can't determine dependence between ");
278 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
279 DR_REF (dra));
280 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
281 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
282 DR_REF (drb));
283 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
286 /* Add to list of ddrs that need to be tested at run-time. */
287 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
290 /* Known data dependence. */
291 if (DDR_NUM_DIST_VECTS (ddr) == 0)
293 /* If user asserted safelen consecutive iterations can be
294 executed concurrently, assume independence. */
295 if (loop->safelen >= 2)
297 if (loop->safelen < *max_vf)
298 *max_vf = loop->safelen;
299 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
300 return false;
303 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
304 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
306 if (dump_enabled_p ())
308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
309 "versioning for alias not supported for: "
310 "bad dist vector for ");
311 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
312 DR_REF (dra));
313 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
314 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
315 DR_REF (drb));
316 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
318 return true;
321 if (dump_enabled_p ())
323 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
324 "versioning for alias required: "
325 "bad dist vector for ");
326 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
327 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
328 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
329 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
331 /* Add to list of ddrs that need to be tested at run-time. */
332 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
335 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
336 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
338 int dist = dist_v[loop_depth];
340 if (dump_enabled_p ())
341 dump_printf_loc (MSG_NOTE, vect_location,
342 "dependence distance = %d.\n", dist);
344 if (dist == 0)
346 if (dump_enabled_p ())
348 dump_printf_loc (MSG_NOTE, vect_location,
349 "dependence distance == 0 between ");
350 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
351 dump_printf (MSG_NOTE, " and ");
352 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
353 dump_printf (MSG_NOTE, "\n");
356 /* When we perform grouped accesses and perform implicit CSE
357 by detecting equal accesses and doing disambiguation with
358 runtime alias tests like for
359 .. = a[i];
360 .. = a[i+1];
361 a[i] = ..;
362 a[i+1] = ..;
363 *p = ..;
364 .. = a[i];
365 .. = a[i+1];
366 where we will end up loading { a[i], a[i+1] } once, make
367 sure that inserting group loads before the first load and
368 stores after the last store will do the right thing.
369 Similar for groups like
370 a[i] = ...;
371 ... = a[i];
372 a[i+1] = ...;
373 where loads from the group interleave with the store. */
374 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
375 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
377 gimple *earlier_stmt;
378 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
379 if (DR_IS_WRITE
380 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
382 if (dump_enabled_p ())
383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
384 "READ_WRITE dependence in interleaving."
385 "\n");
386 return true;
390 continue;
393 if (dist > 0 && DDR_REVERSED_P (ddr))
395 /* If DDR_REVERSED_P the order of the data-refs in DDR was
396 reversed (to make distance vector positive), and the actual
397 distance is negative. */
398 if (dump_enabled_p ())
399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
400 "dependence distance negative.\n");
401 /* Record a negative dependence distance to later limit the
402 amount of stmt copying / unrolling we can perform.
403 Only need to handle read-after-write dependence. */
404 if (DR_IS_READ (drb)
405 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
406 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
407 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
408 continue;
411 if (abs (dist) >= 2
412 && abs (dist) < *max_vf)
414 /* The dependence distance requires reduction of the maximal
415 vectorization factor. */
416 *max_vf = abs (dist);
417 if (dump_enabled_p ())
418 dump_printf_loc (MSG_NOTE, vect_location,
419 "adjusting maximal vectorization factor to %i\n",
420 *max_vf);
423 if (abs (dist) >= *max_vf)
425 /* Dependence distance does not create dependence, as far as
426 vectorization is concerned, in this case. */
427 if (dump_enabled_p ())
428 dump_printf_loc (MSG_NOTE, vect_location,
429 "dependence distance >= VF.\n");
430 continue;
433 if (dump_enabled_p ())
435 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
436 "not vectorized, possible dependence "
437 "between data-refs ");
438 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
439 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
440 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
441 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
444 return true;
447 return false;
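/* For illustration of the distance handling above: in

     for (i = 0; i < n; i++)
       a[i + 2] = a[i] + 1;

   the read-after-write dependence has distance 2, so *MAX_VF is clamped
   to 2; with a larger VF a single vector load of a[i .. i+VF-1] would
   read elements that the same vector iteration has not stored yet.  */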
450 /* Function vect_analyze_data_ref_dependences.
452 Examine all the data references in the loop, and make sure there do not
453 exist any data dependences between them. Set *MAX_VF according to
454 the maximum vectorization factor the data dependences allow. */
456 bool
457 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
459 unsigned int i;
460 struct data_dependence_relation *ddr;
462 if (dump_enabled_p ())
463 dump_printf_loc (MSG_NOTE, vect_location,
464 "=== vect_analyze_data_ref_dependences ===\n");
466 LOOP_VINFO_DDRS (loop_vinfo)
467 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
468 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
469 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
470 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
471 &LOOP_VINFO_DDRS (loop_vinfo),
472 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
473 return false;
475 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
476 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
477 return false;
479 return true;
483 /* Function vect_slp_analyze_data_ref_dependence.
485 Return TRUE if there (might) exist a dependence between a memory-reference
486 DRA and a memory-reference DRB in basic-block (SLP) analysis. Unlike the
487 loop variant, no run-time alias versioning is attempted and there is no
488 vectorization factor to adjust. */
490 static bool
491 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
493 struct data_reference *dra = DDR_A (ddr);
494 struct data_reference *drb = DDR_B (ddr);
496 /* We need to check dependences of statements marked as unvectorizable
497 as well, as they can still prohibit vectorization. */
499 /* Independent data accesses. */
500 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
501 return false;
503 if (dra == drb)
504 return false;
506 /* Read-read is OK. */
507 if (DR_IS_READ (dra) && DR_IS_READ (drb))
508 return false;
510 /* If dra and drb are part of the same interleaving chain consider
511 them independent. */
512 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
513 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
514 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
515 return false;
517 /* Unknown data dependence. */
518 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
520 if (dump_enabled_p ())
522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
523 "can't determine dependence between ");
524 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
525 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
526 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
527 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
530 else if (dump_enabled_p ())
532 dump_printf_loc (MSG_NOTE, vect_location,
533 "determined dependence between ");
534 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
535 dump_printf (MSG_NOTE, " and ");
536 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
537 dump_printf (MSG_NOTE, "\n");
540 return true;
544 /* Analyze dependences involved in the transform of SLP NODE. STORES
545 contain the vector of scalar stores of this instance if we are
546 disambiguating the loads. */
548 static bool
549 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
550 vec<gimple *> stores, gimple *last_store)
552 /* This walks over all stmts involved in the SLP load/store done
553 in NODE verifying we can sink them up to the last stmt in the
554 group. */
555 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
556 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
558 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
559 if (access == last_access)
560 continue;
561 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
562 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
563 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
565 gimple *stmt = gsi_stmt (gsi);
566 if (! gimple_vuse (stmt)
567 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
568 continue;
570 /* If we couldn't record a (single) data reference for this
571 stmt we have to give up. */
572 /* ??? Here and below if dependence analysis fails we can resort
573 to the alias oracle which can handle more kinds of stmts. */
574 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
575 if (!dr_b)
576 return false;
578 /* If we run into a store of this same instance (we've just
579 marked those) then delay dependence checking until we run
580 into the last store because this is where it will have
581 been sunk to (and we verify if we can do that as well). */
582 if (gimple_visited_p (stmt))
584 if (stmt != last_store)
585 continue;
586 unsigned i;
587 gimple *store;
588 FOR_EACH_VEC_ELT (stores, i, store)
590 data_reference *store_dr
591 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
592 ddr_p ddr = initialize_data_dependence_relation
593 (dr_a, store_dr, vNULL);
594 if (vect_slp_analyze_data_ref_dependence (ddr))
596 free_dependence_relation (ddr);
597 return false;
599 free_dependence_relation (ddr);
603 ddr_p ddr = initialize_data_dependence_relation (dr_a, dr_b, vNULL);
604 if (vect_slp_analyze_data_ref_dependence (ddr))
606 free_dependence_relation (ddr);
607 return false;
609 free_dependence_relation (ddr);
612 return true;
616 /* Function vect_slp_analyze_instance_dependence.
618 Examine all the data references in the SLP instance, and make sure there
619 do not exist any data dependences between them that would prevent the
620 instance from being vectorized. */
622 bool
623 vect_slp_analyze_instance_dependence (slp_instance instance)
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE, vect_location,
627 "=== vect_slp_analyze_instance_dependence ===\n");
629 /* The stores of this instance are at the root of the SLP tree. */
630 slp_tree store = SLP_INSTANCE_TREE (instance);
631 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
632 store = NULL;
634 /* Verify we can sink stores to the vectorized stmt insert location. */
635 gimple *last_store = NULL;
636 if (store)
638 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
639 return false;
641 /* Mark stores in this instance and remember the last one. */
642 last_store = vect_find_last_scalar_stmt_in_slp (store);
643 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
644 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
647 bool res = true;
649 /* Verify we can sink loads to the vectorized stmt insert location,
650 special-casing stores of this instance. */
651 slp_tree load;
652 unsigned int i;
653 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
654 if (! vect_slp_analyze_node_dependences (instance, load,
655 store
656 ? SLP_TREE_SCALAR_STMTS (store)
657 : vNULL, last_store))
659 res = false;
660 break;
663 /* Unset the visited flag. */
664 if (store)
665 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
666 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
668 return res;
671 /* Function vect_compute_data_ref_alignment
673 Compute the misalignment of the data reference DR.
675 Output:
676 1. If during the misalignment computation it is found that the data reference
677 cannot be vectorized then false is returned.
678 2. DR_MISALIGNMENT (DR) is defined.
680 FOR NOW: No analysis is actually performed. Misalignment is calculated
681 only for trivial cases. TODO. */
683 bool
684 vect_compute_data_ref_alignment (struct data_reference *dr)
686 gimple *stmt = DR_STMT (dr);
687 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
688 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
689 struct loop *loop = NULL;
690 tree ref = DR_REF (dr);
691 tree vectype;
692 tree base, base_addr;
693 tree misalign = NULL_TREE;
694 tree aligned_to;
695 unsigned HOST_WIDE_INT alignment;
697 if (dump_enabled_p ())
698 dump_printf_loc (MSG_NOTE, vect_location,
699 "vect_compute_data_ref_alignment:\n");
701 if (loop_vinfo)
702 loop = LOOP_VINFO_LOOP (loop_vinfo);
704 /* Initialize misalignment to unknown. */
705 SET_DR_MISALIGNMENT (dr, -1);
707 if (tree_fits_shwi_p (DR_STEP (dr)))
708 misalign = DR_INIT (dr);
709 aligned_to = DR_ALIGNED_TO (dr);
710 base_addr = DR_BASE_ADDRESS (dr);
711 vectype = STMT_VINFO_VECTYPE (stmt_info);
713 /* In case the dataref is in an inner-loop of the loop that is being
714 vectorized (LOOP), we use the base and misalignment information
715 relative to the outer-loop (LOOP). This is ok only if the misalignment
716 stays the same throughout the execution of the inner-loop, which is why
717 we have to check that the stride of the dataref in the inner-loop evenly
718 divides by the vector size. */
719 if (loop && nested_in_vect_loop_p (loop, stmt))
721 tree step = DR_STEP (dr);
723 if (tree_fits_shwi_p (step)
724 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
726 if (dump_enabled_p ())
727 dump_printf_loc (MSG_NOTE, vect_location,
728 "inner step divides the vector-size.\n");
729 misalign = STMT_VINFO_DR_INIT (stmt_info);
730 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
731 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
733 else
735 if (dump_enabled_p ())
736 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
737 "inner step doesn't divide the vector-size.\n");
738 misalign = NULL_TREE;
742 /* Similarly we can only use base and misalignment information relative to
743 an innermost loop if the misalignment stays the same throughout the
744 execution of the loop. As above, this is the case if the stride of
745 the dataref evenly divides by the vector size. */
746 else
748 tree step = DR_STEP (dr);
749 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
751 if (tree_fits_shwi_p (step)
752 && ((tree_to_shwi (step) * vf)
753 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
755 if (dump_enabled_p ())
756 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
757 "step doesn't divide the vector-size.\n");
758 misalign = NULL_TREE;
762 /* To look at alignment of the base we have to preserve an inner MEM_REF
763 as that carries alignment information of the actual access. */
764 base = ref;
765 while (handled_component_p (base))
766 base = TREE_OPERAND (base, 0);
767 if (TREE_CODE (base) == MEM_REF)
768 base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
769 build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
770 unsigned int base_alignment = get_object_alignment (base);
772 if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
773 DR_VECT_AUX (dr)->base_element_aligned = true;
775 alignment = TYPE_ALIGN_UNIT (vectype);
777 if ((compare_tree_int (aligned_to, alignment) < 0)
778 || !misalign)
780 if (dump_enabled_p ())
782 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
783 "Unknown alignment for access: ");
784 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
785 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
787 return true;
790 if (base_alignment < TYPE_ALIGN (vectype))
792 /* Strip an inner MEM_REF to a bare decl if possible. */
793 if (TREE_CODE (base) == MEM_REF
794 && integer_zerop (TREE_OPERAND (base, 1))
795 && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
796 base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
798 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
800 if (dump_enabled_p ())
802 dump_printf_loc (MSG_NOTE, vect_location,
803 "can't force alignment of ref: ");
804 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
805 dump_printf (MSG_NOTE, "\n");
807 return true;
810 /* Force the alignment of the decl.
811 NOTE: This is the only change to the code we make during
812 the analysis phase, before deciding to vectorize the loop. */
813 if (dump_enabled_p ())
815 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
816 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
817 dump_printf (MSG_NOTE, "\n");
820 DR_VECT_AUX (dr)->base_decl = base;
821 DR_VECT_AUX (dr)->base_misaligned = true;
822 DR_VECT_AUX (dr)->base_element_aligned = true;
825 /* If this is a backward running DR then first access in the larger
826 vectype actually is N-1 elements before the address in the DR.
827 Adjust misalign accordingly. */
828 if (tree_int_cst_sgn (DR_STEP (dr)) < 0)
830 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
831 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
832 otherwise we wouldn't be here. */
833 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
834 /* PLUS because DR_STEP was negative. */
835 misalign = size_binop (PLUS_EXPR, misalign, offset);
838 SET_DR_MISALIGNMENT (dr,
839 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
841 if (dump_enabled_p ())
843 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
844 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
845 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
846 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
849 return true;
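/* For illustration, for a V4SI vectype the required alignment is 16 bytes;
   a data-ref whose base is (or can be forced to be) 16-byte aligned and
   whose constant offset from that base is 8 bytes ends up with
   DR_MISALIGNMENT of 8, while a data-ref whose alignment cannot be
   determined keeps DR_MISALIGNMENT of -1 (unknown).  */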
853 /* Function vect_update_misalignment_for_peel
855 DR - the data reference whose misalignment is to be adjusted.
856 DR_PEEL - the data reference whose misalignment is being made
857 zero in the vector loop by the peel.
858 NPEEL - the number of iterations in the peel loop if the misalignment
859 of DR_PEEL is known at compile time. */
861 static void
862 vect_update_misalignment_for_peel (struct data_reference *dr,
863 struct data_reference *dr_peel, int npeel)
865 unsigned int i;
866 vec<dr_p> same_align_drs;
867 struct data_reference *current_dr;
868 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
869 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
870 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
871 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
873 /* For interleaved data accesses the step in the loop must be multiplied by
874 the size of the interleaving group. */
875 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
876 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
877 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
878 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
880 /* It can be assumed that the data refs with the same alignment as dr_peel
881 are aligned in the vector loop. */
882 same_align_drs
883 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
884 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
886 if (current_dr != dr)
887 continue;
888 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
889 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
890 SET_DR_MISALIGNMENT (dr, 0);
891 return;
894 if (known_alignment_for_access_p (dr)
895 && known_alignment_for_access_p (dr_peel))
897 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
898 int misal = DR_MISALIGNMENT (dr);
899 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
900 misal += negative ? -npeel * dr_size : npeel * dr_size;
901 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
902 SET_DR_MISALIGNMENT (dr, misal);
903 return;
906 if (dump_enabled_p ())
907 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
908 SET_DR_MISALIGNMENT (dr, -1);
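/* For illustration of the known-alignment case above: with 4-byte elements,
   a 16-byte vector alignment, DR_MISALIGNMENT (DR) == 4 and a positive step,
   peeling NPEEL == 3 iterations gives (4 + 3 * 4) & 15 == 0, so DR becomes
   aligned as well; if either misalignment is unknown, DR's misalignment is
   reset to -1.  */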
912 /* Function verify_data_ref_alignment
914 Return TRUE if DR can be handled with respect to alignment. */
916 static bool
917 verify_data_ref_alignment (data_reference_p dr)
919 enum dr_alignment_support supportable_dr_alignment
920 = vect_supportable_dr_alignment (dr, false);
921 if (!supportable_dr_alignment)
923 if (dump_enabled_p ())
925 if (DR_IS_READ (dr))
926 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
927 "not vectorized: unsupported unaligned load.");
928 else
929 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
930 "not vectorized: unsupported unaligned "
931 "store.");
933 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
934 DR_REF (dr));
935 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
937 return false;
940 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
941 dump_printf_loc (MSG_NOTE, vect_location,
942 "Vectorizing an unaligned access.\n");
944 return true;
947 /* Function vect_verify_datarefs_alignment
949 Return TRUE if all data references in the loop can be
950 handled with respect to alignment. */
952 bool
953 vect_verify_datarefs_alignment (loop_vec_info vinfo)
955 vec<data_reference_p> datarefs = vinfo->datarefs;
956 struct data_reference *dr;
957 unsigned int i;
959 FOR_EACH_VEC_ELT (datarefs, i, dr)
961 gimple *stmt = DR_STMT (dr);
962 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
964 if (!STMT_VINFO_RELEVANT_P (stmt_info))
965 continue;
967 /* For interleaving, only the alignment of the first access matters. */
968 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
969 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
970 continue;
972 /* Strided accesses perform only component accesses, alignment is
973 irrelevant for them. */
974 if (STMT_VINFO_STRIDED_P (stmt_info)
975 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
976 continue;
978 if (! verify_data_ref_alignment (dr))
979 return false;
982 return true;
985 /* Given a memory reference EXP return whether its alignment is less
986 than its size. */
988 static bool
989 not_size_aligned (tree exp)
991 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
992 return true;
994 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
995 > get_object_alignment (exp));
998 /* Function vector_alignment_reachable_p
1000 Return true if vector alignment for DR is reachable by peeling
1001 a few loop iterations. Return false otherwise. */
1003 static bool
1004 vector_alignment_reachable_p (struct data_reference *dr)
1006 gimple *stmt = DR_STMT (dr);
1007 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1008 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1010 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1012 /* For interleaved accesses we peel only if the number of iterations in
1013 the prolog loop (VF - misalignment) is a multiple of the
1014 number of interleaved accesses (the group size). */
1015 int elem_size, mis_in_elements;
1016 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1018 /* FORNOW: handle only known alignment. */
1019 if (!known_alignment_for_access_p (dr))
1020 return false;
1022 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1023 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1025 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1026 return false;
1029 /* If misalignment is known at compile time then allow peeling
1030 only if natural alignment is reachable through peeling. */
1031 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1033 HOST_WIDE_INT elmsize =
1034 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1035 if (dump_enabled_p ())
1037 dump_printf_loc (MSG_NOTE, vect_location,
1038 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1039 dump_printf (MSG_NOTE,
1040 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1042 if (DR_MISALIGNMENT (dr) % elmsize)
1044 if (dump_enabled_p ())
1045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1046 "data size does not divide the misalignment.\n");
1047 return false;
1051 if (!known_alignment_for_access_p (dr))
1053 tree type = TREE_TYPE (DR_REF (dr));
1054 bool is_packed = not_size_aligned (DR_REF (dr));
1055 if (dump_enabled_p ())
1056 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1057 "Unknown misalignment, is_packed = %d\n",is_packed);
1058 if ((TYPE_USER_ALIGN (type) && !is_packed)
1059 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1060 return true;
1061 else
1062 return false;
1065 return true;
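/* For illustration of the known-misalignment check above: with 4-byte
   elements a misalignment of 6 bytes can never be removed by peeling,
   since each peeled scalar iteration advances the access by whole
   elements (6 % 4 != 0), so vector alignment is not reachable and the
   function returns false.  */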
1069 /* Calculate the cost of the memory access represented by DR. */
1071 static void
1072 vect_get_data_access_cost (struct data_reference *dr,
1073 unsigned int *inside_cost,
1074 unsigned int *outside_cost,
1075 stmt_vector_for_cost *body_cost_vec)
1077 gimple *stmt = DR_STMT (dr);
1078 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1079 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1080 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1081 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1082 int ncopies = vf / nunits;
1084 if (DR_IS_READ (dr))
1085 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1086 NULL, body_cost_vec, false);
1087 else
1088 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1090 if (dump_enabled_p ())
1091 dump_printf_loc (MSG_NOTE, vect_location,
1092 "vect_get_data_access_cost: inside_cost = %d, "
1093 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1097 typedef struct _vect_peel_info
1099 int npeel;
1100 struct data_reference *dr;
1101 unsigned int count;
1102 } *vect_peel_info;
1104 typedef struct _vect_peel_extended_info
1106 struct _vect_peel_info peel_info;
1107 unsigned int inside_cost;
1108 unsigned int outside_cost;
1109 stmt_vector_for_cost body_cost_vec;
1110 } *vect_peel_extended_info;
1113 /* Peeling hashtable helpers. */
1115 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1117 static inline hashval_t hash (const _vect_peel_info *);
1118 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1121 inline hashval_t
1122 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1124 return (hashval_t) peel_info->npeel;
1127 inline bool
1128 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1130 return (a->npeel == b->npeel);
1134 /* Insert DR into peeling hash table with NPEEL as key. */
1136 static void
1137 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1138 loop_vec_info loop_vinfo, struct data_reference *dr,
1139 int npeel)
1141 struct _vect_peel_info elem, *slot;
1142 _vect_peel_info **new_slot;
1143 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1145 elem.npeel = npeel;
1146 slot = peeling_htab->find (&elem);
1147 if (slot)
1148 slot->count++;
1149 else
1151 slot = XNEW (struct _vect_peel_info);
1152 slot->npeel = npeel;
1153 slot->dr = dr;
1154 slot->count = 1;
1155 new_slot = peeling_htab->find_slot (slot, INSERT);
1156 *new_slot = slot;
1159 if (!supportable_dr_alignment
1160 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1161 slot->count += VECT_MAX_COST;
1165 /* Traverse peeling hash table to find peeling option that aligns maximum
1166 number of data accesses. */
1169 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1170 _vect_peel_extended_info *max)
1172 vect_peel_info elem = *slot;
1174 if (elem->count > max->peel_info.count
1175 || (elem->count == max->peel_info.count
1176 && max->peel_info.npeel > elem->npeel))
1178 max->peel_info.npeel = elem->npeel;
1179 max->peel_info.count = elem->count;
1180 max->peel_info.dr = elem->dr;
1183 return 1;
1187 /* Traverse peeling hash table and calculate cost for each peeling option.
1188 Find the one with the lowest cost. */
1191 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1192 _vect_peel_extended_info *min)
1194 vect_peel_info elem = *slot;
1195 int save_misalignment, dummy;
1196 unsigned int inside_cost = 0, outside_cost = 0, i;
1197 gimple *stmt = DR_STMT (elem->dr);
1198 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1199 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1200 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1201 struct data_reference *dr;
1202 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1204 prologue_cost_vec.create (2);
1205 body_cost_vec.create (2);
1206 epilogue_cost_vec.create (2);
1208 FOR_EACH_VEC_ELT (datarefs, i, dr)
1210 stmt = DR_STMT (dr);
1211 stmt_info = vinfo_for_stmt (stmt);
1212 /* For interleaving, only the alignment of the first access
1213 matters. */
1214 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1215 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1216 continue;
1218 save_misalignment = DR_MISALIGNMENT (dr);
1219 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1220 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1221 &body_cost_vec);
1222 SET_DR_MISALIGNMENT (dr, save_misalignment);
1225 outside_cost += vect_get_known_peeling_cost
1226 (loop_vinfo, elem->npeel, &dummy,
1227 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1228 &prologue_cost_vec, &epilogue_cost_vec);
1230 /* Prologue and epilogue costs are added to the target model later.
1231 These costs depend only on the scalar iteration cost, the
1232 number of peeling iterations finally chosen, and the number of
1233 misaligned statements. So discard the information found here. */
1234 prologue_cost_vec.release ();
1235 epilogue_cost_vec.release ();
1237 if (inside_cost < min->inside_cost
1238 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1240 min->inside_cost = inside_cost;
1241 min->outside_cost = outside_cost;
1242 min->body_cost_vec.release ();
1243 min->body_cost_vec = body_cost_vec;
1244 min->peel_info.dr = elem->dr;
1245 min->peel_info.npeel = elem->npeel;
1247 else
1248 body_cost_vec.release ();
1250 return 1;
1254 /* Choose best peeling option by traversing peeling hash table and either
1255 choosing an option with the lowest cost (if cost model is enabled) or the
1256 option that aligns as many accesses as possible. */
1258 static struct data_reference *
1259 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1260 loop_vec_info loop_vinfo,
1261 unsigned int *npeel,
1262 stmt_vector_for_cost *body_cost_vec)
1264 struct _vect_peel_extended_info res;
1266 res.peel_info.dr = NULL;
1267 res.body_cost_vec = stmt_vector_for_cost ();
1269 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1271 res.inside_cost = INT_MAX;
1272 res.outside_cost = INT_MAX;
1273 peeling_htab->traverse <_vect_peel_extended_info *,
1274 vect_peeling_hash_get_lowest_cost> (&res);
1276 else
1278 res.peel_info.count = 0;
1279 peeling_htab->traverse <_vect_peel_extended_info *,
1280 vect_peeling_hash_get_most_frequent> (&res);
1283 *npeel = res.peel_info.npeel;
1284 *body_cost_vec = res.body_cost_vec;
1285 return res.peel_info.dr;
1289 /* Function vect_enhance_data_refs_alignment
1291 This pass will use loop versioning and loop peeling in order to enhance
1292 the alignment of data references in the loop.
1294 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1295 original loop is to be vectorized. Any other loops that are created by
1296 the transformations performed in this pass are not supposed to be
1297 vectorized. This restriction will be relaxed.
1299 This pass will require a cost model to guide it whether to apply peeling
1300 or versioning or a combination of the two. For example, the scheme that
1301 Intel uses when given a loop with several memory accesses is as follows:
1302 choose one memory access ('p') whose alignment you want to force by doing
1303 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1304 other accesses are not necessarily aligned, or (2) use loop versioning to
1305 generate one loop in which all accesses are aligned, and another loop in
1306 which only 'p' is necessarily aligned.
1308 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1309 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1310 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1312 Devising a cost model is the most critical aspect of this work. It will
1313 guide us on which access to peel for, whether to use loop versioning, how
1314 many versions to create, etc. The cost model will probably consist of
1315 generic considerations as well as target specific considerations (on
1316 powerpc for example, misaligned stores are more painful than misaligned
1317 loads).
1319 Here are the general steps involved in alignment enhancements:
1321 -- original loop, before alignment analysis:
1322 for (i=0; i<N; i++){
1323 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1324 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1327 -- After vect_compute_data_refs_alignment:
1328 for (i=0; i<N; i++){
1329 x = q[i]; # DR_MISALIGNMENT(q) = 3
1330 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1333 -- Possibility 1: we do loop versioning:
1334 if (p is aligned) {
1335 for (i=0; i<N; i++){ # loop 1A
1336 x = q[i]; # DR_MISALIGNMENT(q) = 3
1337 p[i] = y; # DR_MISALIGNMENT(p) = 0
1340 else {
1341 for (i=0; i<N; i++){ # loop 1B
1342 x = q[i]; # DR_MISALIGNMENT(q) = 3
1343 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1347 -- Possibility 2: we do loop peeling:
1348 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1349 x = q[i];
1350 p[i] = y;
1352 for (i = 3; i < N; i++){ # loop 2A
1353 x = q[i]; # DR_MISALIGNMENT(q) = 0
1354 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1357 -- Possibility 3: combination of loop peeling and versioning:
1358 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1359 x = q[i];
1360 p[i] = y;
1362 if (p is aligned) {
1363 for (i = 3; i<N; i++){ # loop 3A
1364 x = q[i]; # DR_MISALIGNMENT(q) = 0
1365 p[i] = y; # DR_MISALIGNMENT(p) = 0
1368 else {
1369 for (i = 3; i<N; i++){ # loop 3B
1370 x = q[i]; # DR_MISALIGNMENT(q) = 0
1371 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1375 These loops are later passed to loop_transform to be vectorized. The
1376 vectorizer will use the alignment information to guide the transformation
1377 (whether to generate regular loads/stores, or with special handling for
1378 misalignment). */
1380 bool
1381 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1383 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1384 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1385 enum dr_alignment_support supportable_dr_alignment;
1386 struct data_reference *dr0 = NULL, *first_store = NULL;
1387 struct data_reference *dr;
1388 unsigned int i, j;
1389 bool do_peeling = false;
1390 bool do_versioning = false;
1391 bool stat;
1392 gimple *stmt;
1393 stmt_vec_info stmt_info;
1394 unsigned int npeel = 0;
1395 bool all_misalignments_unknown = true;
1396 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1397 unsigned possible_npeel_number = 1;
1398 tree vectype;
1399 unsigned int nelements, mis, same_align_drs_max = 0;
1400 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1401 hash_table<peel_info_hasher> peeling_htab (1);
1403 if (dump_enabled_p ())
1404 dump_printf_loc (MSG_NOTE, vect_location,
1405 "=== vect_enhance_data_refs_alignment ===\n");
1407 /* Reset data so we can safely be called multiple times. */
1408 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1409 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1411 /* While cost model enhancements are expected in the future, the high level
1412 view of the code at this time is as follows:
1414 A) If there is a misaligned access then see if peeling to align
1415 this access can make all data references satisfy
1416 vect_supportable_dr_alignment. If so, update data structures
1417 as needed and return true.
1419 B) If peeling wasn't possible and there is a data reference with an
1420 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1421 then see if loop versioning checks can be used to make all data
1422 references satisfy vect_supportable_dr_alignment. If so, update
1423 data structures as needed and return true.
1425 C) If neither peeling nor versioning were successful then return false if
1426 any data reference does not satisfy vect_supportable_dr_alignment.
1428 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1430 Note, Possibility 3 above (which is peeling and versioning together) is not
1431 being done at this time. */
1433 /* (1) Peeling to force alignment. */
1435 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1436 Considerations:
1437 + How many accesses will become aligned due to the peeling
1438 - How many accesses will become unaligned due to the peeling,
1439 and the cost of misaligned accesses.
1440 - The cost of peeling (the extra runtime checks, the increase
1441 in code size). */
1443 FOR_EACH_VEC_ELT (datarefs, i, dr)
1445 stmt = DR_STMT (dr);
1446 stmt_info = vinfo_for_stmt (stmt);
1448 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1449 continue;
1451 /* For interleaving, only the alignment of the first access
1452 matters. */
1453 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1454 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1455 continue;
1457 /* For invariant accesses there is nothing to enhance. */
1458 if (integer_zerop (DR_STEP (dr)))
1459 continue;
1461 /* Strided accesses perform only component accesses, alignment is
1462 irrelevant for them. */
1463 if (STMT_VINFO_STRIDED_P (stmt_info)
1464 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1465 continue;
1467 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1468 do_peeling = vector_alignment_reachable_p (dr);
1469 if (do_peeling)
1471 if (known_alignment_for_access_p (dr))
1473 unsigned int npeel_tmp;
1474 bool negative = tree_int_cst_compare (DR_STEP (dr),
1475 size_zero_node) < 0;
1477 /* Save info about DR in the hash table. */
1478 vectype = STMT_VINFO_VECTYPE (stmt_info);
1479 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1480 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1481 TREE_TYPE (DR_REF (dr))));
1482 npeel_tmp = (negative
1483 ? (mis - nelements) : (nelements - mis))
1484 & (nelements - 1);
1486 /* For multiple types, it is possible that the bigger type access
1487 will have more than one peeling option. E.g., a loop with two
1488 types: one of size (vector size / 4), and the other one of
1489 size (vector size / 8). The vectorization factor will be 8. If both
1490 accesses are misaligned by 3, the first one needs one scalar
1491 iteration to be aligned, and the second one needs 5. But the
1492 first one will also be aligned by peeling 5 scalar
1493 iterations, and in that case both accesses will be aligned.
1494 Hence, except for the immediate peeling amount, we also want
1495 to try to add a full vector size, while we don't exceed the
1496 vectorization factor.
1497 We do this automatically for the cost model, since we calculate
1498 the cost for every peeling option. */
1499 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1501 if (STMT_SLP_TYPE (stmt_info))
1502 possible_npeel_number
1503 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1504 else
1505 possible_npeel_number = vf / nelements;
1508 /* Handle the aligned case. We may decide to align some other
1509 access, making DR unaligned. */
1510 if (DR_MISALIGNMENT (dr) == 0)
1512 npeel_tmp = 0;
1513 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1514 possible_npeel_number++;
1517 for (j = 0; j < possible_npeel_number; j++)
1519 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1520 dr, npeel_tmp);
1521 npeel_tmp += nelements;
1524 all_misalignments_unknown = false;
1525 /* Data-ref that was chosen for the case that all the
1526 misalignments are unknown is not relevant anymore, since we
1527 have a data-ref with known alignment. */
1528 dr0 = NULL;
1530 else
1532 /* If we don't know any misalignment values, we prefer
1533 peeling for data-ref that has the maximum number of data-refs
1534 with the same alignment, unless the target prefers to align
1535 stores over loads. */
1536 if (all_misalignments_unknown)
1538 unsigned same_align_drs
1539 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1540 if (!dr0
1541 || same_align_drs_max < same_align_drs)
1543 same_align_drs_max = same_align_drs;
1544 dr0 = dr;
1546 /* For data-refs with the same number of related
1547 accesses prefer the one where the misalign
1548 computation will be invariant in the outermost loop. */
1549 else if (same_align_drs_max == same_align_drs)
1551 struct loop *ivloop0, *ivloop;
1552 ivloop0 = outermost_invariant_loop_for_expr
1553 (loop, DR_BASE_ADDRESS (dr0));
1554 ivloop = outermost_invariant_loop_for_expr
1555 (loop, DR_BASE_ADDRESS (dr));
1556 if ((ivloop && !ivloop0)
1557 || (ivloop && ivloop0
1558 && flow_loop_nested_p (ivloop, ivloop0)))
1559 dr0 = dr;
1562 if (!first_store && DR_IS_WRITE (dr))
1563 first_store = dr;
1566 /* If there are both known and unknown misaligned accesses in the
1567 loop, we choose peeling amount according to the known
1568 accesses. */
1569 if (!supportable_dr_alignment)
1571 dr0 = dr;
1572 if (!first_store && DR_IS_WRITE (dr))
1573 first_store = dr;
1577 else
1579 if (!aligned_access_p (dr))
1581 if (dump_enabled_p ())
1582 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1583 "vector alignment may not be reachable\n");
1584 break;
1589 /* Check if we can possibly peel the loop. */
1590 if (!vect_can_advance_ivs_p (loop_vinfo)
1591 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1592 || loop->inner)
1593 do_peeling = false;
1595 if (do_peeling
1596 && all_misalignments_unknown
1597 && vect_supportable_dr_alignment (dr0, false))
1599 /* Check if the target prefers to align stores over loads, i.e., if
1600 misaligned stores are more expensive than misaligned loads (taking
1601 drs with same alignment into account). */
1602 if (first_store && DR_IS_READ (dr0))
1604 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1605 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1606 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1607 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1608 stmt_vector_for_cost dummy;
1609 dummy.create (2);
1611 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1612 &dummy);
1613 vect_get_data_access_cost (first_store, &store_inside_cost,
1614 &store_outside_cost, &dummy);
1616 dummy.release ();
1618 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1619 aligning the load DR0). */
1620 load_inside_penalty = store_inside_cost;
1621 load_outside_penalty = store_outside_cost;
1622 for (i = 0;
1623 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1624 DR_STMT (first_store))).iterate (i, &dr);
1625 i++)
1626 if (DR_IS_READ (dr))
1628 load_inside_penalty += load_inside_cost;
1629 load_outside_penalty += load_outside_cost;
1631 else
1633 load_inside_penalty += store_inside_cost;
1634 load_outside_penalty += store_outside_cost;
1637 /* Calculate the penalty for leaving DR0 unaligned (by
1638 aligning the FIRST_STORE). */
1639 store_inside_penalty = load_inside_cost;
1640 store_outside_penalty = load_outside_cost;
1641 for (i = 0;
1642 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1643 DR_STMT (dr0))).iterate (i, &dr);
1644 i++)
1645 if (DR_IS_READ (dr))
1647 store_inside_penalty += load_inside_cost;
1648 store_outside_penalty += load_outside_cost;
1650 else
1652 store_inside_penalty += store_inside_cost;
1653 store_outside_penalty += store_outside_cost;
1656 if (load_inside_penalty > store_inside_penalty
1657 || (load_inside_penalty == store_inside_penalty
1658 && load_outside_penalty > store_outside_penalty))
1659 dr0 = first_store;
1662 /* In case there are only loads with different unknown misalignments, use
1663 peeling only if it may help to align other accesses in the loop or
1664 if it may help improve load bandwidth when we'd end up using
1665 unaligned loads. */
1666 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1667 if (!first_store
1668 && !STMT_VINFO_SAME_ALIGN_REFS (
1669 vinfo_for_stmt (DR_STMT (dr0))).length ()
1670 && (vect_supportable_dr_alignment (dr0, false)
1671 != dr_unaligned_supported
1672 || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1673 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1674 do_peeling = false;
1677 if (do_peeling && !dr0)
1679 /* Peeling is possible, but there is no data access that requires alignment
1680 in order to be vectorized. So we try to choose the best possible peeling. */
1682 /* We should get here only if there are drs with known misalignment. */
1683 gcc_assert (!all_misalignments_unknown);
1685 /* Choose the best peeling from the hash table. */
1686 dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
1687 loop_vinfo, &npeel,
1688 &body_cost_vec);
1689 if (!dr0 || !npeel)
1690 do_peeling = false;
1693 if (do_peeling)
1695 stmt = DR_STMT (dr0);
1696 stmt_info = vinfo_for_stmt (stmt);
1697 vectype = STMT_VINFO_VECTYPE (stmt_info);
1698 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1700 if (known_alignment_for_access_p (dr0))
1702 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1703 size_zero_node) < 0;
1704 if (!npeel)
1706 /* Since it's known at compile time, compute the number of
1707 iterations in the peeled loop (the peeling factor) for use in
1708 updating DR_MISALIGNMENT values. The peeling factor is the
1709 vectorization factor minus the misalignment as an element
1710 count. */
1711 mis = DR_MISALIGNMENT (dr0);
1712 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1713 npeel = ((negative ? mis - nelements : nelements - mis)
1714 & (nelements - 1));
1717 /* For interleaved data access every iteration accesses all the
1718 members of the group, therefore we divide the number of iterations
1719 by the group size. */
1720 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1721 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1722 npeel /= GROUP_SIZE (stmt_info);
1724 if (dump_enabled_p ())
1725 dump_printf_loc (MSG_NOTE, vect_location,
1726 "Try peeling by %d\n", npeel);
1729 /* Ensure that all data refs can be vectorized after the peel. */
1730 FOR_EACH_VEC_ELT (datarefs, i, dr)
1732 int save_misalignment;
1734 if (dr == dr0)
1735 continue;
1737 stmt = DR_STMT (dr);
1738 stmt_info = vinfo_for_stmt (stmt);
1739 /* For interleaving, only the alignment of the first access
1740 matters. */
1741 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1742 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1743 continue;
1745 /* Strided accesses perform only component accesses, alignment is
1746 irrelevant for them. */
1747 if (STMT_VINFO_STRIDED_P (stmt_info)
1748 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1749 continue;
1751 save_misalignment = DR_MISALIGNMENT (dr);
1752 vect_update_misalignment_for_peel (dr, dr0, npeel);
1753 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1754 SET_DR_MISALIGNMENT (dr, save_misalignment);
1756 if (!supportable_dr_alignment)
1758 do_peeling = false;
1759 break;
1763 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1765 stat = vect_verify_datarefs_alignment (loop_vinfo);
1766 if (!stat)
1767 do_peeling = false;
1768 else
1770 body_cost_vec.release ();
1771 return stat;
1775 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1776 if (do_peeling)
1778 unsigned max_allowed_peel
1779 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1780 if (max_allowed_peel != (unsigned)-1)
1782 unsigned max_peel = npeel;
1783 if (max_peel == 0)
1785 gimple *dr_stmt = DR_STMT (dr0);
1786 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1787 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1788 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1790 if (max_peel > max_allowed_peel)
1792 do_peeling = false;
1793 if (dump_enabled_p ())
1794 dump_printf_loc (MSG_NOTE, vect_location,
1795 "Disable peeling, max peels reached: %d\n", max_peel);
1800 /* Cost model #2 - if peeling may result in a remaining loop not
1801 iterating enough to be vectorized then do not peel. */
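/* For example, with a vectorization factor of 4 and npeel == 2 the
   check below requires at least 4 + 2 == 6 known scalar iterations;
   with fewer, the loop remaining after peeling could not be
   profitably vectorized.  */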
1802 if (do_peeling
1803 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1805 unsigned max_peel
1806 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1807 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1808 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1809 do_peeling = false;
1812 if (do_peeling)
1814 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1815 If the misalignment of DR_i is identical to that of dr0 then set
1816 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1817 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1818 by the peeling factor times the element size of DR_i (MOD the
1819 vectorization factor times the size). Otherwise, the
1820 misalignment of DR_i must be set to unknown. */
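/* For instance (made-up numbers): with 4-byte elements and 16-byte
   vectors, peeling npeel == 2 iterations turns a DR_i misalignment of
   4 bytes into (4 + 2 * 4) % 16 == 12, while a DR_i whose misalignment
   matches that of dr0 is simply set to zero.  */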
1821 FOR_EACH_VEC_ELT (datarefs, i, dr)
1822 if (dr != dr0)
1823 vect_update_misalignment_for_peel (dr, dr0, npeel);
1825 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1826 if (npeel)
1827 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1828 else
1829 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1830 = DR_MISALIGNMENT (dr0);
1831 SET_DR_MISALIGNMENT (dr0, 0);
1832 if (dump_enabled_p ())
1834 dump_printf_loc (MSG_NOTE, vect_location,
1835 "Alignment of access forced using peeling.\n");
1836 dump_printf_loc (MSG_NOTE, vect_location,
1837 "Peeling for alignment will be applied.\n");
1839 /* The inside-loop cost will be accounted for in vectorizable_load
1840 and vectorizable_store correctly with adjusted alignments.
1841 Drop the body_cost_vec on the floor here. */
1842 body_cost_vec.release ();
1844 stat = vect_verify_datarefs_alignment (loop_vinfo);
1845 gcc_assert (stat);
1846 return stat;
1850 body_cost_vec.release ();
1852 /* (2) Versioning to force alignment. */
1854 /* Try versioning if:
1855 1) optimize loop for speed
1856 2) there is at least one unsupported misaligned data ref with an unknown
1857 misalignment, and
1858 3) all misaligned data refs with a known misalignment are supported, and
1859 4) the number of runtime alignment checks is within reason. */
1861 do_versioning =
1862 optimize_loop_nest_for_speed_p (loop)
1863 && (!loop->inner); /* FORNOW */
1865 if (do_versioning)
1867 FOR_EACH_VEC_ELT (datarefs, i, dr)
1869 stmt = DR_STMT (dr);
1870 stmt_info = vinfo_for_stmt (stmt);
1872 /* For interleaving, only the alignment of the first access
1873 matters. */
1874 if (aligned_access_p (dr)
1875 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1876 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1877 continue;
1879 if (STMT_VINFO_STRIDED_P (stmt_info))
1881 /* Strided loads perform only component accesses, alignment is
1882 irrelevant for them. */
1883 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1884 continue;
1885 do_versioning = false;
1886 break;
1889 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1891 if (!supportable_dr_alignment)
1893 gimple *stmt;
1894 int mask;
1895 tree vectype;
1897 if (known_alignment_for_access_p (dr)
1898 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1899 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1901 do_versioning = false;
1902 break;
1905 stmt = DR_STMT (dr);
1906 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1907 gcc_assert (vectype);
1909 /* The rightmost bits of an aligned address must be zeros.
1910 Construct the mask needed for this test. For example,
1911 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1912 mask must be 15 = 0xf. */
1913 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1915 /* FORNOW: use the same mask to test all potentially unaligned
1916 references in the loop. The vectorizer currently supports
1917 a single vector size, see the reference to
1918 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1919 vectorization factor is computed. */
1920 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1921 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1922 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1923 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1924 DR_STMT (dr));
1928 /* Versioning requires at least one misaligned data reference. */
1929 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1930 do_versioning = false;
1931 else if (!do_versioning)
1932 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1935 if (do_versioning)
1937 vec<gimple *> may_misalign_stmts
1938 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1939 gimple *stmt;
1941 /* It can now be assumed that the data references in the statements
1942 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1943 of the loop being vectorized. */
1944 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1946 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1947 dr = STMT_VINFO_DATA_REF (stmt_info);
1948 SET_DR_MISALIGNMENT (dr, 0);
1949 if (dump_enabled_p ())
1950 dump_printf_loc (MSG_NOTE, vect_location,
1951 "Alignment of access forced using versioning.\n");
1954 if (dump_enabled_p ())
1955 dump_printf_loc (MSG_NOTE, vect_location,
1956 "Versioning for alignment will be applied.\n");
1958 /* Peeling and versioning can't be done together at this time. */
1959 gcc_assert (! (do_peeling && do_versioning));
1961 stat = vect_verify_datarefs_alignment (loop_vinfo);
1962 gcc_assert (stat);
1963 return stat;
1966 /* This point is reached if neither peeling nor versioning is being done. */
1967 gcc_assert (! (do_peeling || do_versioning));
1969 stat = vect_verify_datarefs_alignment (loop_vinfo);
1970 return stat;
1974 /* Function vect_find_same_alignment_drs.
1976 Update group and alignment relations according to the chosen
1977 vectorization factor. */
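/* A hedged illustration: with a vectorization factor of 4 and two
   4-byte references whose dependence distance is 8 iterations, the
   accesses are 32 bytes apart in every iteration - a multiple of the
   16-byte vector size - so the two references always share the same
   misalignment and are recorded in each other's SAME_ALIGN_REFS
   below.  */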
1979 static void
1980 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1981 loop_vec_info loop_vinfo)
1983 unsigned int i;
1984 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1985 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1986 struct data_reference *dra = DDR_A (ddr);
1987 struct data_reference *drb = DDR_B (ddr);
1988 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1989 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1990 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1991 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1992 lambda_vector dist_v;
1993 unsigned int loop_depth;
1995 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1996 return;
1998 if (dra == drb)
1999 return;
2001 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
2002 return;
2004 /* Loop-based vectorization and known data dependence. */
2005 if (DDR_NUM_DIST_VECTS (ddr) == 0)
2006 return;
2008 /* Data-dependence analysis reports a distance vector of zero
2009 for data-references that overlap only in the first iteration
2010 but have steps of different sign (see PR45764).
2011 So as a sanity check require equal DR_STEP. */
2012 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2013 return;
2015 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
2016 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
2018 int dist = dist_v[loop_depth];
2020 if (dump_enabled_p ())
2021 dump_printf_loc (MSG_NOTE, vect_location,
2022 "dependence distance = %d.\n", dist);
2024 /* Same loop iteration. */
2025 if (dist == 0
2026 || (dist % vectorization_factor == 0 && dra_size == drb_size))
2028 /* Two references with distance zero have the same alignment. */
2029 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2030 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2031 if (dump_enabled_p ())
2033 dump_printf_loc (MSG_NOTE, vect_location,
2034 "accesses have the same alignment.\n");
2035 dump_printf (MSG_NOTE,
2036 "dependence distance modulo vf == 0 between ");
2037 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2038 dump_printf (MSG_NOTE, " and ");
2039 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2040 dump_printf (MSG_NOTE, "\n");
2047 /* Function vect_analyze_data_refs_alignment
2049 Analyze the alignment of the data-references in the loop.
2050 Return FALSE if a data reference is found that cannot be vectorized. */
2052 bool
2053 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2055 if (dump_enabled_p ())
2056 dump_printf_loc (MSG_NOTE, vect_location,
2057 "=== vect_analyze_data_refs_alignment ===\n");
2059 /* Mark groups of data references with same alignment using
2060 data dependence information. */
2061 vec<ddr_p> ddrs = vinfo->ddrs;
2062 struct data_dependence_relation *ddr;
2063 unsigned int i;
2065 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2066 vect_find_same_alignment_drs (ddr, vinfo);
2068 vec<data_reference_p> datarefs = vinfo->datarefs;
2069 struct data_reference *dr;
2071 FOR_EACH_VEC_ELT (datarefs, i, dr)
2073 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2074 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2075 && !vect_compute_data_ref_alignment (dr))
2077 /* Strided accesses perform only component accesses, misalignment
2078 information is irrelevant for them. */
2079 if (STMT_VINFO_STRIDED_P (stmt_info)
2080 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2081 continue;
2083 if (dump_enabled_p ())
2084 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2085 "not vectorized: can't calculate alignment "
2086 "for data ref.\n");
2088 return false;
2092 return true;
2096 /* Analyze alignment of DRs of stmts in NODE. */
2098 static bool
2099 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2101 /* We vectorize from the first scalar stmt in the node unless
2102 the node is permuted in which case we start from the first
2103 element in the group. */
2104 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2105 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2106 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2107 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2109 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2110 if (! vect_compute_data_ref_alignment (dr)
2111 /* For creating the data-ref pointer we need alignment of the
2112 first element anyway. */
2113 || (dr != first_dr
2114 && ! vect_compute_data_ref_alignment (first_dr))
2115 || ! verify_data_ref_alignment (dr))
2117 if (dump_enabled_p ())
2118 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2119 "not vectorized: bad data alignment in basic "
2120 "block.\n");
2121 return false;
2124 return true;
2127 /* Function vect_slp_analyze_instance_alignment
2129 Analyze the alignment of the data-references in the SLP instance.
2130 Return FALSE if a data reference is found that cannot be vectorized. */
2132 bool
2133 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2135 if (dump_enabled_p ())
2136 dump_printf_loc (MSG_NOTE, vect_location,
2137 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2139 slp_tree node;
2140 unsigned i;
2141 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2142 if (! vect_slp_analyze_and_verify_node_alignment (node))
2143 return false;
2145 node = SLP_INSTANCE_TREE (instance);
2146 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2147 && ! vect_slp_analyze_and_verify_node_alignment
2148 (SLP_INSTANCE_TREE (instance)))
2149 return false;
2151 return true;
2155 /* Analyze groups of accesses: check that DR belongs to a group of
2156 accesses of legal size, step, etc. Detect gaps, single element
2157 interleaving, and other special cases. Set grouped access info.
2158 Collect groups of strided stores for further use in SLP analysis.
2159 Worker for vect_analyze_group_access. */
2161 static bool
2162 vect_analyze_group_access_1 (struct data_reference *dr)
2164 tree step = DR_STEP (dr);
2165 tree scalar_type = TREE_TYPE (DR_REF (dr));
2166 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2167 gimple *stmt = DR_STMT (dr);
2168 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2169 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2170 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2171 HOST_WIDE_INT dr_step = -1;
2172 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2173 bool slp_impossible = false;
2175 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2176 size of the interleaving group (including gaps). */
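/* For instance, loads of a[4*i] and a[4*i + 2] with 4-byte elements
   share a step of 16 bytes, so GROUPSIZE is 16 / 4 == 4 even though
   only two of the four slots are touched; the remaining slots are
   gaps.  */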
2177 if (tree_fits_shwi_p (step))
2179 dr_step = tree_to_shwi (step);
2180 /* Check that STEP is a multiple of type size. Otherwise there is
2181 a non-element-sized gap at the end of the group which we
2182 cannot represent in GROUP_GAP or GROUP_SIZE.
2183 ??? As we can handle non-constant step fine here we should
2184 simply remove uses of GROUP_GAP between the last and first
2185 element and instead rely on DR_STEP. GROUP_SIZE then would
2186 simply not include that gap. */
2187 if ((dr_step % type_size) != 0)
2189 if (dump_enabled_p ())
2191 dump_printf_loc (MSG_NOTE, vect_location,
2192 "Step ");
2193 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2194 dump_printf (MSG_NOTE,
2195 " is not a multiple of the element size for ");
2196 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2197 dump_printf (MSG_NOTE, "\n");
2199 return false;
2201 groupsize = absu_hwi (dr_step) / type_size;
2203 else
2204 groupsize = 0;
2206 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2207 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2209 /* Check if this DR is part of an interleaving group, and is a single
2210 element of the group that is accessed in the loop. */
2212 /* Gaps are supported only for loads. STEP must be a multiple of the type
2213 size. The size of the group must be a power of 2. */
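/* For example, a lone read of a[2*i] with 4-byte elements has a step
   of 8 bytes, so groupsize == 2, a power of two, and the access is
   recorded below as single element interleaving of group size 2.  */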
2214 if (DR_IS_READ (dr)
2215 && (dr_step % type_size) == 0
2216 && groupsize > 0
2217 && exact_log2 (groupsize) != -1)
2219 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2220 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2221 if (dump_enabled_p ())
2223 dump_printf_loc (MSG_NOTE, vect_location,
2224 "Detected single element interleaving ");
2225 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2226 dump_printf (MSG_NOTE, " step ");
2227 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2228 dump_printf (MSG_NOTE, "\n");
2231 return true;
2234 if (dump_enabled_p ())
2236 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2237 "not consecutive access ");
2238 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2241 if (bb_vinfo)
2243 /* Mark the statement as unvectorizable. */
2244 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2245 return true;
2248 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2249 STMT_VINFO_STRIDED_P (stmt_info) = true;
2250 return true;
2253 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2255 /* First stmt in the interleaving chain. Check the chain. */
2256 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2257 struct data_reference *data_ref = dr;
2258 unsigned int count = 1;
2259 tree prev_init = DR_INIT (data_ref);
2260 gimple *prev = stmt;
2261 HOST_WIDE_INT diff, gaps = 0;
2263 while (next)
2265 /* Skip same data-refs. In case that two or more stmts share
2266 data-ref (supported only for loads), we vectorize only the first
2267 stmt, and the rest get their vectorized loads from the first
2268 one. */
2269 if (!tree_int_cst_compare (DR_INIT (data_ref),
2270 DR_INIT (STMT_VINFO_DATA_REF (
2271 vinfo_for_stmt (next)))))
2273 if (DR_IS_WRITE (data_ref))
2275 if (dump_enabled_p ())
2276 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2277 "Two store stmts share the same dr.\n");
2278 return false;
2281 if (dump_enabled_p ())
2282 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2283 "Two or more load stmts share the same dr.\n");
2285 /* For load use the same data-ref load. */
2286 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2288 prev = next;
2289 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2290 continue;
2293 prev = next;
2294 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2296 /* All group members have the same STEP by construction. */
2297 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2299 /* Check that the distance between two accesses is equal to the type
2300 size. Otherwise, we have gaps. */
2301 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2302 - TREE_INT_CST_LOW (prev_init)) / type_size;
2303 if (diff != 1)
2305 /* FORNOW: SLP of accesses with gaps is not supported. */
2306 slp_impossible = true;
2307 if (DR_IS_WRITE (data_ref))
2309 if (dump_enabled_p ())
2310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2311 "interleaved store with gaps\n");
2312 return false;
2315 gaps += diff - 1;
2318 last_accessed_element += diff;
2320 /* Store the gap from the previous member of the group. If there is no
2321 gap in the access, GROUP_GAP is always 1. */
2322 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2324 prev_init = DR_INIT (data_ref);
2325 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2326 /* Count the number of data-refs in the chain. */
2327 count++;
2330 if (groupsize == 0)
2331 groupsize = count + gaps;
2333 if (groupsize > UINT_MAX)
2335 if (dump_enabled_p ())
2336 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2337 "group is too large\n");
2338 return false;
2341 /* Check that the size of the interleaving is equal to count for stores,
2342 i.e., that there are no gaps. */
2343 if (groupsize != count
2344 && !DR_IS_READ (dr))
2346 if (dump_enabled_p ())
2347 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2348 "interleaved store with gaps\n");
2349 return false;
2352 /* If there is a gap after the last load in the group it is the
2353 difference between the groupsize and the last accessed
2354 element.
2355 When there is no gap, this difference should be 0. */
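/* E.g., for groupsize == 4 with members at element offsets 0, 1 and 2,
   last_accessed_element ends up as 3 and the GROUP_GAP of the first
   statement becomes 4 - 3 == 1.  */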
2356 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2358 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2359 if (dump_enabled_p ())
2361 dump_printf_loc (MSG_NOTE, vect_location,
2362 "Detected interleaving ");
2363 if (DR_IS_READ (dr))
2364 dump_printf (MSG_NOTE, "load ");
2365 else
2366 dump_printf (MSG_NOTE, "store ");
2367 dump_printf (MSG_NOTE, "of size %u starting with ",
2368 (unsigned)groupsize);
2369 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2370 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2371 dump_printf_loc (MSG_NOTE, vect_location,
2372 "There is a gap of %u elements after the group\n",
2373 GROUP_GAP (vinfo_for_stmt (stmt)));
2376 /* SLP: create an SLP data structure for every interleaving group of
2377 stores for further analysis in vect_analyze_slp. */
2378 if (DR_IS_WRITE (dr) && !slp_impossible)
2380 if (loop_vinfo)
2381 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2382 if (bb_vinfo)
2383 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2387 return true;
2390 /* Analyze groups of accesses: check that DR belongs to a group of
2391 accesses of legal size, step, etc. Detect gaps, single element
2392 interleaving, and other special cases. Set grouped access info.
2393 Collect groups of strided stores for further use in SLP analysis. */
2395 static bool
2396 vect_analyze_group_access (struct data_reference *dr)
2398 if (!vect_analyze_group_access_1 (dr))
2400 /* Dissolve the group if present. */
2401 gimple *next;
2402 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2403 while (stmt)
2405 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2406 next = GROUP_NEXT_ELEMENT (vinfo);
2407 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2408 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2409 stmt = next;
2411 return false;
2413 return true;
2416 /* Analyze the access pattern of the data-reference DR.
2417 In case of non-consecutive accesses call vect_analyze_group_access() to
2418 analyze groups of accesses. */
2420 static bool
2421 vect_analyze_data_ref_access (struct data_reference *dr)
2423 tree step = DR_STEP (dr);
2424 tree scalar_type = TREE_TYPE (DR_REF (dr));
2425 gimple *stmt = DR_STMT (dr);
2426 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2427 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2428 struct loop *loop = NULL;
2430 if (loop_vinfo)
2431 loop = LOOP_VINFO_LOOP (loop_vinfo);
2433 if (loop_vinfo && !step)
2435 if (dump_enabled_p ())
2436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2437 "bad data-ref access in loop\n");
2438 return false;
2441 /* Allow loads with zero step in inner-loop vectorization. */
2442 if (loop_vinfo && integer_zerop (step))
2444 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2445 if (!nested_in_vect_loop_p (loop, stmt))
2446 return DR_IS_READ (dr);
2447 /* Allow references with zero step for outer loops marked
2448 with pragma omp simd only - it guarantees absence of
2449 loop-carried dependencies between inner loop iterations. */
2450 if (!loop->force_vectorize)
2452 if (dump_enabled_p ())
2453 dump_printf_loc (MSG_NOTE, vect_location,
2454 "zero step in inner loop of nest\n");
2455 return false;
2459 if (loop && nested_in_vect_loop_p (loop, stmt))
2461 /* Interleaved accesses are not yet supported within outer-loop
2462 vectorization for references in the inner-loop. */
2463 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2465 /* For the rest of the analysis we use the outer-loop step. */
2466 step = STMT_VINFO_DR_STEP (stmt_info);
2467 if (integer_zerop (step))
2469 if (dump_enabled_p ())
2470 dump_printf_loc (MSG_NOTE, vect_location,
2471 "zero step in outer loop.\n");
2472 return DR_IS_READ (dr);
2476 /* Consecutive? */
2477 if (TREE_CODE (step) == INTEGER_CST)
2479 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2480 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2481 || (dr_step < 0
2482 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2484 /* Mark that it is not interleaving. */
2485 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2486 return true;
2490 if (loop && nested_in_vect_loop_p (loop, stmt))
2492 if (dump_enabled_p ())
2493 dump_printf_loc (MSG_NOTE, vect_location,
2494 "grouped access in outer loop.\n");
2495 return false;
2499 /* Assume this is a DR handled by non-constant strided load case. */
2500 if (TREE_CODE (step) != INTEGER_CST)
2501 return (STMT_VINFO_STRIDED_P (stmt_info)
2502 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2503 || vect_analyze_group_access (dr)));
2505 /* Not a consecutive access - check if it's part of an interleaving group. */
2506 return vect_analyze_group_access (dr);
2511 /* A helper function used in the comparator function to sort data
2512 references. T1 and T2 are two data references to be compared.
2513 The function returns -1, 0, or 1. */
2515 static int
2516 compare_tree (tree t1, tree t2)
2518 int i, cmp;
2519 enum tree_code code;
2520 char tclass;
2522 if (t1 == t2)
2523 return 0;
2524 if (t1 == NULL)
2525 return -1;
2526 if (t2 == NULL)
2527 return 1;
2529 STRIP_NOPS (t1);
2530 STRIP_NOPS (t2);
2532 if (TREE_CODE (t1) != TREE_CODE (t2))
2533 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2535 code = TREE_CODE (t1);
2536 switch (code)
2538 /* For const values, we can just use hash values for comparisons. */
2539 case INTEGER_CST:
2540 case REAL_CST:
2541 case FIXED_CST:
2542 case STRING_CST:
2543 case COMPLEX_CST:
2544 case VECTOR_CST:
2546 hashval_t h1 = iterative_hash_expr (t1, 0);
2547 hashval_t h2 = iterative_hash_expr (t2, 0);
2548 if (h1 != h2)
2549 return h1 < h2 ? -1 : 1;
2550 break;
2553 case SSA_NAME:
2554 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2555 if (cmp != 0)
2556 return cmp;
2558 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2559 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2560 break;
2562 default:
2563 tclass = TREE_CODE_CLASS (code);
2565 /* For var-decl, we could compare their UIDs. */
2566 if (tclass == tcc_declaration)
2568 if (DECL_UID (t1) != DECL_UID (t2))
2569 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2570 break;
2573 /* For expressions with operands, compare their operands recursively. */
2574 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2576 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2577 if (cmp != 0)
2578 return cmp;
2582 return 0;
2586 /* Compare two data-references DRA and DRB to group them into chunks
2587 suitable for grouping. */
2589 static int
2590 dr_group_sort_cmp (const void *dra_, const void *drb_)
2592 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2593 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2594 int cmp;
2596 /* Stabilize sort. */
2597 if (dra == drb)
2598 return 0;
2600 /* Ordering of DRs according to base. */
2601 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2603 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2604 if (cmp != 0)
2605 return cmp;
2608 /* And according to DR_OFFSET. */
2609 if (!dr_equal_offsets_p (dra, drb))
2611 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2612 if (cmp != 0)
2613 return cmp;
2616 /* Put reads before writes. */
2617 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2618 return DR_IS_READ (dra) ? -1 : 1;
2620 /* Then sort after access size. */
2621 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2622 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2624 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2625 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2626 if (cmp != 0)
2627 return cmp;
2630 /* And after step. */
2631 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2633 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2634 if (cmp != 0)
2635 return cmp;
2638 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2639 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2640 if (cmp == 0)
2641 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2642 return cmp;
2645 /* Function vect_analyze_data_ref_accesses.
2647 Analyze the access pattern of all the data references in the loop.
2649 FORNOW: the only access pattern that is considered vectorizable is a
2650 simple step 1 (consecutive) access.
2652 FORNOW: handle only arrays and pointer accesses. */
2654 bool
2655 vect_analyze_data_ref_accesses (vec_info *vinfo)
2657 unsigned int i;
2658 vec<data_reference_p> datarefs = vinfo->datarefs;
2659 struct data_reference *dr;
2661 if (dump_enabled_p ())
2662 dump_printf_loc (MSG_NOTE, vect_location,
2663 "=== vect_analyze_data_ref_accesses ===\n");
2665 if (datarefs.is_empty ())
2666 return true;
2668 /* Sort the array of datarefs to make building the interleaving chains
2669 linear. Don't modify the original vector's order, it is needed for
2670 determining what dependencies are reversed. */
2671 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2672 datarefs_copy.qsort (dr_group_sort_cmp);
2674 /* Build the interleaving chains. */
2675 for (i = 0; i < datarefs_copy.length () - 1;)
2677 data_reference_p dra = datarefs_copy[i];
2678 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2679 stmt_vec_info lastinfo = NULL;
2680 for (i = i + 1; i < datarefs_copy.length (); ++i)
2682 data_reference_p drb = datarefs_copy[i];
2683 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2685 /* ??? Imperfect sorting (non-compatible types, non-modulo
2686 accesses, same accesses) can lead to a group being artificially
2687 split here as we don't just skip over those. If it really
2688 matters we can push those to a worklist and re-iterate
2689 over them. Then we can just skip ahead to the next DR here. */
2691 /* Check that the data-refs have same first location (except init)
2692 and they are both either store or load (not load and store,
2693 not masked loads or stores). */
2694 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2695 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2696 DR_BASE_ADDRESS (drb), 0)
2697 || !dr_equal_offsets_p (dra, drb)
2698 || !gimple_assign_single_p (DR_STMT (dra))
2699 || !gimple_assign_single_p (DR_STMT (drb)))
2700 break;
2702 /* Check that the data-refs have the same constant size. */
2703 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2704 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2705 if (!tree_fits_uhwi_p (sza)
2706 || !tree_fits_uhwi_p (szb)
2707 || !tree_int_cst_equal (sza, szb))
2708 break;
2710 /* Check that the data-refs have the same step. */
2711 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2712 break;
2714 /* Do not place the same access in the interleaving chain twice. */
2715 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2716 break;
2718 /* Check the types are compatible.
2719 ??? We don't distinguish this during sorting. */
2720 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2721 TREE_TYPE (DR_REF (drb))))
2722 break;
2724 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2725 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2726 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2727 gcc_assert (init_a < init_b);
2729 /* If init_b == init_a + the size of the type * k, we have an
2730 interleaving, and DRA is accessed before DRB. */
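/* E.g., with 4-byte accesses, DR_INITs of 0 and 8 differ by two whole
   elements, so the pair can still join the same interleaving chain
   provided the store-adjacency and step checks below also pass.  */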
2731 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2732 if (type_size_a == 0
2733 || (init_b - init_a) % type_size_a != 0)
2734 break;
2736 /* If we have a store, the accesses are adjacent. This splits
2737 groups into chunks we support (we don't support vectorization
2738 of stores with gaps). */
2739 if (!DR_IS_READ (dra)
2740 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2741 (DR_INIT (datarefs_copy[i-1]))
2742 != type_size_a))
2743 break;
2745 /* If the step (if not zero or non-constant) is greater than the
2746 difference between data-refs' inits this splits groups into
2747 suitable sizes. */
2748 if (tree_fits_shwi_p (DR_STEP (dra)))
2750 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2751 if (step != 0 && step <= (init_b - init_a))
2752 break;
2755 if (dump_enabled_p ())
2757 dump_printf_loc (MSG_NOTE, vect_location,
2758 "Detected interleaving ");
2759 if (DR_IS_READ (dra))
2760 dump_printf (MSG_NOTE, "load ");
2761 else
2762 dump_printf (MSG_NOTE, "store ");
2763 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2764 dump_printf (MSG_NOTE, " and ");
2765 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2766 dump_printf (MSG_NOTE, "\n");
2769 /* Link the found element into the group list. */
2770 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2772 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2773 lastinfo = stmtinfo_a;
2775 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2776 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2777 lastinfo = stmtinfo_b;
2781 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2782 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2783 && !vect_analyze_data_ref_access (dr))
2785 if (dump_enabled_p ())
2786 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2787 "not vectorized: complicated access pattern.\n");
2789 if (is_a <bb_vec_info> (vinfo))
2791 /* Mark the statement as not vectorizable. */
2792 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2793 continue;
2795 else
2797 datarefs_copy.release ();
2798 return false;
2802 datarefs_copy.release ();
2803 return true;
2807 /* Operator == between two dr_with_seg_len objects.
2809 This equality operator is used to make sure two data refs
2810 are the same one so that we will consider combining the
2811 aliasing checks of those two pairs of data-dependent data
2812 refs. */
2814 static bool
2815 operator == (const dr_with_seg_len& d1,
2816 const dr_with_seg_len& d2)
2818 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2819 DR_BASE_ADDRESS (d2.dr), 0)
2820 && compare_tree (d1.offset, d2.offset) == 0
2821 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2824 /* Function comp_dr_with_seg_len_pair.
2826 Comparison function for sorting objects of dr_with_seg_len_pair_t
2827 so that we can combine aliasing checks in one scan. */
2829 static int
2830 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2832 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2833 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2835 const dr_with_seg_len &p11 = p1->first,
2836 &p12 = p1->second,
2837 &p21 = p2->first,
2838 &p22 = p2->second;
2840 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2841 if a and c have the same base address and step, and b and d have the same
2842 address and step. Therefore, if either a&c or b&d don't have the same address
2843 and step, we don't care about the order of those two pairs after sorting.
2844 int comp_res;
2846 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2847 DR_BASE_ADDRESS (p21.dr))) != 0)
2848 return comp_res;
2849 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2850 DR_BASE_ADDRESS (p22.dr))) != 0)
2851 return comp_res;
2852 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2853 return comp_res;
2854 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2855 return comp_res;
2856 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2857 return comp_res;
2858 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2859 return comp_res;
2861 return 0;
2864 /* Function vect_vfa_segment_size.
2866 Create an expression that computes the size of segment
2867 that will be accessed for a data reference. The function takes into
2868 account that realignment loads may access one more vector.
2870 Input:
2871 DR: The data reference.
2872 LENGTH_FACTOR: segment length to consider.
2874 Return an expression whose value is the size of segment which will be
2875 accessed by DR. */
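/* For instance, a unit-stride access to 4-byte elements with
   LENGTH_FACTOR equal to a vectorization factor of 4 gets a segment of
   16 bytes, with one extra vector added when the access uses the
   optimized explicit-realignment scheme.  */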
2877 static tree
2878 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2880 tree segment_length;
2882 if (integer_zerop (DR_STEP (dr)))
2883 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2884 else
2885 segment_length = size_binop (MULT_EXPR,
2886 fold_convert (sizetype, DR_STEP (dr)),
2887 fold_convert (sizetype, length_factor));
2889 if (vect_supportable_dr_alignment (dr, false)
2890 == dr_explicit_realign_optimized)
2892 tree vector_size = TYPE_SIZE_UNIT
2893 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2895 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2897 return segment_length;
2900 /* Function vect_prune_runtime_alias_test_list.
2902 Prune a list of ddrs to be tested at run-time by versioning for alias.
2903 Merge several alias checks into one if possible.
2904 Return FALSE if the resulting list of ddrs is longer than allowed by
2905 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2907 bool
2908 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2910 vec<ddr_p> may_alias_ddrs =
2911 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2912 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2913 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2914 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2915 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2917 ddr_p ddr;
2918 unsigned int i;
2919 tree length_factor;
2921 if (dump_enabled_p ())
2922 dump_printf_loc (MSG_NOTE, vect_location,
2923 "=== vect_prune_runtime_alias_test_list ===\n");
2925 if (may_alias_ddrs.is_empty ())
2926 return true;
2928 /* Basically, for each pair of dependent data refs store_ptr_0
2929 and load_ptr_0, we create an expression:
2931 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2932 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2934 for aliasing checks. However, in some cases we can decrease
2935 the number of checks by combining two checks into one. For
2936 example, suppose we have another pair of data refs store_ptr_0
2937 and load_ptr_1, and if the following condition is satisfied:
2939 load_ptr_0 < load_ptr_1 &&
2940 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2942 (this condition means, in each iteration of vectorized loop,
2943 the accessed memory of store_ptr_0 cannot be between the memory
2944 of load_ptr_0 and load_ptr_1.)
2946 we then can use only the following expression to finish the
2947 aliasing checks between store_ptr_0 & load_ptr_0 and
2948 store_ptr_0 & load_ptr_1:
2950 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2951 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2953 Note that we only consider this combination when load_ptr_0 and
2954 load_ptr_1 have the same base address. */
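/* A hypothetical illustration of the condition above: if
   load_ptr_1 == load_ptr_0 + 4 and load_segment_length_0 covers at
   least those 4 bytes, the inequality holds and the two alias checks
   against store_ptr_0 can be merged into one.  */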
2956 comp_alias_ddrs.create (may_alias_ddrs.length ());
2958 /* First, we collect all data ref pairs for aliasing checks. */
2959 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2961 struct data_reference *dr_a, *dr_b;
2962 gimple *dr_group_first_a, *dr_group_first_b;
2963 tree segment_length_a, segment_length_b;
2964 gimple *stmt_a, *stmt_b;
2966 dr_a = DDR_A (ddr);
2967 stmt_a = DR_STMT (DDR_A (ddr));
2968 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2969 if (dr_group_first_a)
2971 stmt_a = dr_group_first_a;
2972 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2975 dr_b = DDR_B (ddr);
2976 stmt_b = DR_STMT (DDR_B (ddr));
2977 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2978 if (dr_group_first_b)
2980 stmt_b = dr_group_first_b;
2981 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2984 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2985 length_factor = scalar_loop_iters;
2986 else
2987 length_factor = size_int (vect_factor);
2988 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2989 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2991 dr_with_seg_len_pair_t dr_with_seg_len_pair
2992 (dr_with_seg_len (dr_a, segment_length_a),
2993 dr_with_seg_len (dr_b, segment_length_b));
2995 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2996 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2998 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3001 /* Second, we sort the collected data ref pairs so that we can scan
3002 them once to combine all possible aliasing checks. */
3003 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
3005 /* Third, we scan the sorted dr pairs and check if we can combine
3006 alias checks of two neighbouring dr pairs. */
3007 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
3009 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
3010 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
3011 *dr_b1 = &comp_alias_ddrs[i-1].second,
3012 *dr_a2 = &comp_alias_ddrs[i].first,
3013 *dr_b2 = &comp_alias_ddrs[i].second;
3015 /* Remove duplicate data ref pairs. */
3016 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
3018 if (dump_enabled_p ())
3020 dump_printf_loc (MSG_NOTE, vect_location,
3021 "found equal ranges ");
3022 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3023 DR_REF (dr_a1->dr));
3024 dump_printf (MSG_NOTE, ", ");
3025 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3026 DR_REF (dr_b1->dr));
3027 dump_printf (MSG_NOTE, " and ");
3028 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3029 DR_REF (dr_a2->dr));
3030 dump_printf (MSG_NOTE, ", ");
3031 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3032 DR_REF (dr_b2->dr));
3033 dump_printf (MSG_NOTE, "\n");
3036 comp_alias_ddrs.ordered_remove (i--);
3037 continue;
3040 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
3042 /* We consider the case that DR_B1 and DR_B2 are the same memrefs,
3043 and DR_A1 and DR_A2 are two consecutive memrefs. */
3044 if (*dr_a1 == *dr_a2)
3046 std::swap (dr_a1, dr_b1);
3047 std::swap (dr_a2, dr_b2);
3050 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
3051 DR_BASE_ADDRESS (dr_a2->dr),
3053 || !tree_fits_shwi_p (dr_a1->offset)
3054 || !tree_fits_shwi_p (dr_a2->offset))
3055 continue;
3057 HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
3058 - tree_to_shwi (dr_a1->offset));
3061 /* Now we check if the following condition is satisfied:
3063 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
3065 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
3066 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
3067 have to make a best estimate. We can get the minimum value
3068 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
3069 then either of the following two conditions can guarantee the
3070 one above:
3072 1: DIFF <= MIN_SEG_LEN_B
3073 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
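/* A numeric sketch with made-up values: DIFF == 4 and
   MIN_SEG_LEN_B == 16 satisfies condition 1, so the two pairs are
   merged and the surviving segment length is set below to DR_A2's
   segment length plus DIFF, covering both accesses.  */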
3077 HOST_WIDE_INT min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
3078 ? tree_to_shwi (dr_b1->seg_len)
3079 : vect_factor);
3081 if (diff <= min_seg_len_b
3082 || (tree_fits_shwi_p (dr_a1->seg_len)
3083 && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
3085 if (dump_enabled_p ())
3087 dump_printf_loc (MSG_NOTE, vect_location,
3088 "merging ranges for ");
3089 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3090 DR_REF (dr_a1->dr));
3091 dump_printf (MSG_NOTE, ", ");
3092 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3093 DR_REF (dr_b1->dr));
3094 dump_printf (MSG_NOTE, " and ");
3095 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3096 DR_REF (dr_a2->dr));
3097 dump_printf (MSG_NOTE, ", ");
3098 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3099 DR_REF (dr_b2->dr));
3100 dump_printf (MSG_NOTE, "\n");
3103 dr_a1->seg_len = size_binop (PLUS_EXPR,
3104 dr_a2->seg_len, size_int (diff));
3105 comp_alias_ddrs.ordered_remove (i--);
3110 dump_printf_loc (MSG_NOTE, vect_location,
3111 "improved number of alias checks from %d to %d\n",
3112 may_alias_ddrs.length (), comp_alias_ddrs.length ());
3113 if ((int) comp_alias_ddrs.length () >
3114 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3115 return false;
3117 return true;
3120 /* Check whether a non-affine read or write in stmt is suitable for gather load
3121 or scatter store and if so, return a builtin decl for that operation. */
3123 tree
3124 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo, tree *basep,
3125 tree *offp, int *scalep)
3127 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3128 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3129 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3130 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3131 tree offtype = NULL_TREE;
3132 tree decl, base, off;
3133 machine_mode pmode;
3134 int punsignedp, reversep, pvolatilep = 0;
3136 base = DR_REF (dr);
3137 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3138 see if we can use the def stmt of the address. */
3139 if (is_gimple_call (stmt)
3140 && gimple_call_internal_p (stmt)
3141 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3142 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3143 && TREE_CODE (base) == MEM_REF
3144 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3145 && integer_zerop (TREE_OPERAND (base, 1))
3146 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3148 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3149 if (is_gimple_assign (def_stmt)
3150 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3151 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3154 /* The gather and scatter builtins need address of the form
3155 loop_invariant + vector * {1, 2, 4, 8}
3157 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3158 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3159 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3160 multiplications and additions in it. To get a vector, we need
3161 a single SSA_NAME that will be defined in the loop and will
3162 contain everything that is not loop invariant and that can be
3163 vectorized. The following code attempts to find such a preexisting
3164 SSA_NAME OFF and put the loop invariants into a tree BASE
3165 that can be gimplified before the loop. */
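/* A sketch (not from the original sources) of the kind of access this
   recognizes:
     for (i = 0; i < n; i++)
       res[i] = in[idx[i]];
   Here OFF ends up being the SSA_NAME computed from idx[i], SCALE the
   element size of *in (assuming it is one of 1, 2, 4 or 8) and BASE
   the loop-invariant address of IN, provided the target advertises a
   suitable builtin_gather.  */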
3166 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3167 &punsignedp, &reversep, &pvolatilep, false);
3168 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3170 if (TREE_CODE (base) == MEM_REF)
3172 if (!integer_zerop (TREE_OPERAND (base, 1)))
3174 if (off == NULL_TREE)
3176 offset_int moff = mem_ref_offset (base);
3177 off = wide_int_to_tree (sizetype, moff);
3179 else
3180 off = size_binop (PLUS_EXPR, off,
3181 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3183 base = TREE_OPERAND (base, 0);
3185 else
3186 base = build_fold_addr_expr (base);
3188 if (off == NULL_TREE)
3189 off = size_zero_node;
3191 /* If base is not loop invariant, then if off is 0 we start with just
3192 the constant offset in the loop invariant BASE and continue with base
3193 as OFF, otherwise give up.
3194 We could handle that case by gimplifying the addition of base + off
3195 into some SSA_NAME and using that as off, but for now punt. */
3196 if (!expr_invariant_in_loop_p (loop, base))
3198 if (!integer_zerop (off))
3199 return NULL_TREE;
3200 off = base;
3201 base = size_int (pbitpos / BITS_PER_UNIT);
3203 /* Otherwise put base + constant offset into the loop invariant BASE
3204 and continue with OFF. */
3205 else
3207 base = fold_convert (sizetype, base);
3208 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3211 /* OFF at this point may be either a SSA_NAME or some tree expression
3212 from get_inner_reference. Try to peel off loop invariants from it
3213 into BASE as long as possible. */
3214 STRIP_NOPS (off);
3215 while (offtype == NULL_TREE)
3217 enum tree_code code;
3218 tree op0, op1, add = NULL_TREE;
3220 if (TREE_CODE (off) == SSA_NAME)
3222 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3224 if (expr_invariant_in_loop_p (loop, off))
3225 return NULL_TREE;
3227 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3228 break;
3230 op0 = gimple_assign_rhs1 (def_stmt);
3231 code = gimple_assign_rhs_code (def_stmt);
3232 op1 = gimple_assign_rhs2 (def_stmt);
3234 else
3236 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3237 return NULL_TREE;
3238 code = TREE_CODE (off);
3239 extract_ops_from_tree (off, &code, &op0, &op1);
3241 switch (code)
3243 case POINTER_PLUS_EXPR:
3244 case PLUS_EXPR:
3245 if (expr_invariant_in_loop_p (loop, op0))
3247 add = op0;
3248 off = op1;
3249 do_add:
3250 add = fold_convert (sizetype, add);
3251 if (scale != 1)
3252 add = size_binop (MULT_EXPR, add, size_int (scale));
3253 base = size_binop (PLUS_EXPR, base, add);
3254 continue;
3256 if (expr_invariant_in_loop_p (loop, op1))
3258 add = op1;
3259 off = op0;
3260 goto do_add;
3262 break;
3263 case MINUS_EXPR:
3264 if (expr_invariant_in_loop_p (loop, op1))
3266 add = fold_convert (sizetype, op1);
3267 add = size_binop (MINUS_EXPR, size_zero_node, add);
3268 off = op0;
3269 goto do_add;
3271 break;
3272 case MULT_EXPR:
3273 if (scale == 1 && tree_fits_shwi_p (op1))
3275 scale = tree_to_shwi (op1);
3276 off = op0;
3277 continue;
3279 break;
3280 case SSA_NAME:
3281 off = op0;
3282 continue;
3283 CASE_CONVERT:
3284 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3285 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3286 break;
3287 if (TYPE_PRECISION (TREE_TYPE (op0))
3288 == TYPE_PRECISION (TREE_TYPE (off)))
3290 off = op0;
3291 continue;
3293 if (TYPE_PRECISION (TREE_TYPE (op0))
3294 < TYPE_PRECISION (TREE_TYPE (off)))
3296 off = op0;
3297 offtype = TREE_TYPE (off);
3298 STRIP_NOPS (off);
3299 continue;
3301 break;
3302 default:
3303 break;
3305 break;
3308 /* If at the end OFF still isn't a SSA_NAME or isn't
3309 defined in the loop, punt. */
3310 if (TREE_CODE (off) != SSA_NAME
3311 || expr_invariant_in_loop_p (loop, off))
3312 return NULL_TREE;
3314 if (offtype == NULL_TREE)
3315 offtype = TREE_TYPE (off);
3317 if (DR_IS_READ (dr))
3318 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3319 offtype, scale);
3320 else
3321 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3322 offtype, scale);
3324 if (decl == NULL_TREE)
3325 return NULL_TREE;
3327 if (basep)
3328 *basep = base;
3329 if (offp)
3330 *offp = off;
3331 if (scalep)
3332 *scalep = scale;
3333 return decl;
3336 /* Function vect_analyze_data_refs.
3338 Find all the data references in the loop or basic block.
3340 The general structure of the analysis of data refs in the vectorizer is as
3341 follows:
3342 1- vect_analyze_data_refs(loop/bb): call
3343 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3344 in the loop/bb and their dependences.
3345 2- vect_analyze_data_ref_dependences(): apply dependence testing using ddrs.
3346 3- vect_analyze_data_refs_alignment(): check that ref_stmt.alignment is ok.
3347 4- vect_analyze_data_ref_accesses(): check that ref_stmt.step is ok.
3351 bool
3352 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3354 struct loop *loop = NULL;
3355 unsigned int i;
3356 struct data_reference *dr;
3357 tree scalar_type;
3359 if (dump_enabled_p ())
3360 dump_printf_loc (MSG_NOTE, vect_location,
3361 "=== vect_analyze_data_refs ===\n");
3363 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3364 loop = LOOP_VINFO_LOOP (loop_vinfo);
3366 /* Go through the data-refs, check that the analysis succeeded. Update
3367 pointer from stmt_vec_info struct to DR and vectype. */
3369 vec<data_reference_p> datarefs = vinfo->datarefs;
3370 FOR_EACH_VEC_ELT (datarefs, i, dr)
3372 gimple *stmt;
3373 stmt_vec_info stmt_info;
3374 tree base, offset, init;
3375 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3376 bool simd_lane_access = false;
3377 int vf;
3379 again:
3380 if (!dr || !DR_REF (dr))
3382 if (dump_enabled_p ())
3383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3384 "not vectorized: unhandled data-ref\n");
3385 return false;
3388 stmt = DR_STMT (dr);
3389 stmt_info = vinfo_for_stmt (stmt);
3391 /* Discard clobbers from the dataref vector. We will remove
3392 clobber stmts during vectorization. */
3393 if (gimple_clobber_p (stmt))
3395 free_data_ref (dr);
3396 if (i == datarefs.length () - 1)
3398 datarefs.pop ();
3399 break;
3401 datarefs.ordered_remove (i);
3402 dr = datarefs[i];
3403 goto again;
3406 /* Check that analysis of the data-ref succeeded. */
3407 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3408 || !DR_STEP (dr))
3410 bool maybe_gather
3411 = DR_IS_READ (dr)
3412 && !TREE_THIS_VOLATILE (DR_REF (dr))
3413 && targetm.vectorize.builtin_gather != NULL;
3414 bool maybe_scatter
3415 = DR_IS_WRITE (dr)
3416 && !TREE_THIS_VOLATILE (DR_REF (dr))
3417 && targetm.vectorize.builtin_scatter != NULL;
3418 bool maybe_simd_lane_access
3419 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3421 /* If target supports vector gather loads or scatter stores, or if
3422 this might be a SIMD lane access, see if they can't be used. */
3423 if (is_a <loop_vec_info> (vinfo)
3424 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3425 && !nested_in_vect_loop_p (loop, stmt))
3427 struct data_reference *newdr
3428 = create_data_ref (NULL, loop_containing_stmt (stmt),
3429 DR_REF (dr), stmt, maybe_scatter ? false : true);
3430 gcc_assert (newdr != NULL && DR_REF (newdr));
3431 if (DR_BASE_ADDRESS (newdr)
3432 && DR_OFFSET (newdr)
3433 && DR_INIT (newdr)
3434 && DR_STEP (newdr)
3435 && integer_zerop (DR_STEP (newdr)))
3437 if (maybe_simd_lane_access)
3439 tree off = DR_OFFSET (newdr);
3440 STRIP_NOPS (off);
3441 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3442 && TREE_CODE (off) == MULT_EXPR
3443 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3445 tree step = TREE_OPERAND (off, 1);
3446 off = TREE_OPERAND (off, 0);
3447 STRIP_NOPS (off);
3448 if (CONVERT_EXPR_P (off)
3449 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3450 0)))
3451 < TYPE_PRECISION (TREE_TYPE (off)))
3452 off = TREE_OPERAND (off, 0);
3453 if (TREE_CODE (off) == SSA_NAME)
3455 gimple *def = SSA_NAME_DEF_STMT (off);
3456 tree reft = TREE_TYPE (DR_REF (newdr));
3457 if (is_gimple_call (def)
3458 && gimple_call_internal_p (def)
3459 && (gimple_call_internal_fn (def)
3460 == IFN_GOMP_SIMD_LANE))
3462 tree arg = gimple_call_arg (def, 0);
3463 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3464 arg = SSA_NAME_VAR (arg);
3465 if (arg == loop->simduid
3466 /* For now. */
3467 && tree_int_cst_equal
3468 (TYPE_SIZE_UNIT (reft),
3469 step))
3471 DR_OFFSET (newdr) = ssize_int (0);
3472 DR_STEP (newdr) = step;
3473 DR_ALIGNED_TO (newdr)
3474 = size_int (BIGGEST_ALIGNMENT);
3475 dr = newdr;
3476 simd_lane_access = true;
3482 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3484 dr = newdr;
3485 if (maybe_gather)
3486 gatherscatter = GATHER;
3487 else
3488 gatherscatter = SCATTER;
3491 if (gatherscatter == SG_NONE && !simd_lane_access)
3492 free_data_ref (newdr);
3495 if (gatherscatter == SG_NONE && !simd_lane_access)
3497 if (dump_enabled_p ())
3499 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3500 "not vectorized: data ref analysis "
3501 "failed ");
3502 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3503 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3506 if (is_a <bb_vec_info> (vinfo))
3507 break;
3509 return false;
3513 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3515 if (dump_enabled_p ())
3516 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3517 "not vectorized: base addr of dr is a "
3518 "constant\n");
3520 if (is_a <bb_vec_info> (vinfo))
3521 break;
3523 if (gatherscatter != SG_NONE || simd_lane_access)
3524 free_data_ref (dr);
3525 return false;
3528 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3530 if (dump_enabled_p ())
3532 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3533 "not vectorized: volatile type ");
3534 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3535 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3538 if (is_a <bb_vec_info> (vinfo))
3539 break;
3541 return false;
3544 if (stmt_can_throw_internal (stmt))
3546 if (dump_enabled_p ())
3548 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3549 "not vectorized: statement can throw an "
3550 "exception ");
3551 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3552 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3555 if (is_a <bb_vec_info> (vinfo))
3556 break;
3558 if (gatherscatter != SG_NONE || simd_lane_access)
3559 free_data_ref (dr);
3560 return false;
3563 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3564 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3566 if (dump_enabled_p ())
3568 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3569 "not vectorized: statement is bitfield "
3570 "access ");
3571 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3572 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3575 if (is_a <bb_vec_info> (vinfo))
3576 break;
3578 if (gatherscatter != SG_NONE || simd_lane_access)
3579 free_data_ref (dr);
3580 return false;
3583 base = unshare_expr (DR_BASE_ADDRESS (dr));
3584 offset = unshare_expr (DR_OFFSET (dr));
3585 init = unshare_expr (DR_INIT (dr));
3587 if (is_gimple_call (stmt)
3588 && (!gimple_call_internal_p (stmt)
3589 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3590 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3592 if (dump_enabled_p ())
3594 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3595 "not vectorized: dr in a call ");
3596 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3597 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3600 if (is_a <bb_vec_info> (vinfo))
3601 break;
3603 if (gatherscatter != SG_NONE || simd_lane_access)
3604 free_data_ref (dr);
3605 return false;
3608 /* Update DR field in stmt_vec_info struct. */
3610 /* If the dataref is in an inner-loop of the loop that is considered
3611 for vectorization, we also want to analyze the access relative to
3612 the outer-loop (DR contains information only relative to the
3613 inner-most enclosing loop). We do that by building a reference to the
3614 first location accessed by the inner-loop, and analyze it relative to
3615 the outer-loop. */
3616 if (loop && nested_in_vect_loop_p (loop, stmt))
3618 tree outer_step, outer_base, outer_init;
3619 HOST_WIDE_INT pbitsize, pbitpos;
3620 tree poffset;
3621 machine_mode pmode;
3622 int punsignedp, preversep, pvolatilep;
3623 affine_iv base_iv, offset_iv;
3624 tree dinit;
3626 /* Build a reference to the first location accessed by the
3627 inner-loop: *(BASE+INIT). (The first location is actually
3628 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3629 tree inner_base = build_fold_indirect_ref
3630 (fold_build_pointer_plus (base, init));
3632 if (dump_enabled_p ())
3634 dump_printf_loc (MSG_NOTE, vect_location,
3635 "analyze in outer-loop: ");
3636 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3637 dump_printf (MSG_NOTE, "\n");
3640 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3641 &poffset, &pmode, &punsignedp,
3642 &preversep, &pvolatilep, false);
3643 gcc_assert (outer_base != NULL_TREE);
3645 if (pbitpos % BITS_PER_UNIT != 0)
3647 if (dump_enabled_p ())
3648 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3649 "failed: bit offset alignment.\n");
3650 return false;
3653 if (preversep)
3655 if (dump_enabled_p ())
3656 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3657 "failed: reverse storage order.\n");
3658 return false;
3661 outer_base = build_fold_addr_expr (outer_base);
3662 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3663 &base_iv, false))
3665 if (dump_enabled_p ())
3666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3667 "failed: evolution of base is not affine.\n");
3668 return false;
3671 if (offset)
3673 if (poffset)
3674 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3675 poffset);
3676 else
3677 poffset = offset;
3680 if (!poffset)
3682 offset_iv.base = ssize_int (0);
3683 offset_iv.step = ssize_int (0);
3685 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3686 &offset_iv, false))
3688 if (dump_enabled_p ())
3689 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3690 "evolution of offset is not affine.\n");
3691 return false;
3694 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3695 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3696 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3697 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3698 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3700 outer_step = size_binop (PLUS_EXPR,
3701 fold_convert (ssizetype, base_iv.step),
3702 fold_convert (ssizetype, offset_iv.step));
3704 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3705 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3706 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3707 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3708 STMT_VINFO_DR_OFFSET (stmt_info) =
3709 fold_convert (ssizetype, offset_iv.base);
3710 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3711 size_int (highest_pow2_factor (offset_iv.base));
3713 if (dump_enabled_p ())
3715 dump_printf_loc (MSG_NOTE, vect_location,
3716 "\touter base_address: ");
3717 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3718 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3719 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3720 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3721 STMT_VINFO_DR_OFFSET (stmt_info));
3722 dump_printf (MSG_NOTE,
3723 "\n\touter constant offset from base address: ");
3724 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3725 STMT_VINFO_DR_INIT (stmt_info));
3726 dump_printf (MSG_NOTE, "\n\touter step: ");
3727 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3728 STMT_VINFO_DR_STEP (stmt_info));
3729 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3730 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3731 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3732 dump_printf (MSG_NOTE, "\n");
3736 if (STMT_VINFO_DATA_REF (stmt_info))
3738 if (dump_enabled_p ())
3740 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3741 "not vectorized: more than one data ref "
3742 "in stmt: ");
3743 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3744 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3747 if (is_a <bb_vec_info> (vinfo))
3748 break;
3750 if (gatherscatter != SG_NONE || simd_lane_access)
3751 free_data_ref (dr);
3752 return false;
3755 STMT_VINFO_DATA_REF (stmt_info) = dr;
3756 if (simd_lane_access)
3758 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3759 free_data_ref (datarefs[i]);
3760 datarefs[i] = dr;
3763 /* Set vectype for STMT. */
3764 scalar_type = TREE_TYPE (DR_REF (dr));
3765 STMT_VINFO_VECTYPE (stmt_info)
3766 = get_vectype_for_scalar_type (scalar_type);
3767 if (!STMT_VINFO_VECTYPE (stmt_info))
3769 if (dump_enabled_p ())
3771 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3772 "not vectorized: no vectype for stmt: ");
3773 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3774 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3775 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3776 scalar_type);
3777 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3780 if (is_a <bb_vec_info> (vinfo))
3782 /* No vector type is fine, the ref can still participate
3783 in dependence analysis, we just can't vectorize it. */
3784 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3785 continue;
3788 if (gatherscatter != SG_NONE || simd_lane_access)
3790 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3791 if (gatherscatter != SG_NONE)
3792 free_data_ref (dr);
3794 return false;
3796 else
3798 if (dump_enabled_p ())
3800 dump_printf_loc (MSG_NOTE, vect_location,
3801 "got vectype for stmt: ");
3802 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3803 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3804 STMT_VINFO_VECTYPE (stmt_info));
3805 dump_printf (MSG_NOTE, "\n");
3809 /* Adjust the minimal vectorization factor according to the
3810 vector type. */
3811 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3812 if (vf > *min_vf)
3813 *min_vf = vf;
3815 if (gatherscatter != SG_NONE)
3817 tree off;
3818 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3819 NULL, &off, NULL)
3820 || get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3822 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3823 free_data_ref (dr);
3824 if (dump_enabled_p ())
3826 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3827 (gatherscatter == GATHER) ?
3828 "not vectorized: not suitable for gather "
3829 "load " :
3830 "not vectorized: not suitable for scatter "
3831 "store ");
3832 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3833 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3835 return false;
3838 datarefs[i] = dr;
3839 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3842 else if (is_a <loop_vec_info> (vinfo)
3843 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3845 if (nested_in_vect_loop_p (loop, stmt))
3847 if (dump_enabled_p ())
3849 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3850 "not vectorized: not suitable for strided "
3851 "load ");
3852 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3853 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3855 return false;
3857 STMT_VINFO_STRIDED_P (stmt_info) = true;
3861 /* If, when trying to vectorize a basic-block, we stopped the analysis
3862 at the first dataref we could not analyze, mark the rest of the
3863 datarefs as not vectorizable and truncate the vector of datarefs.
3864 That avoids spending useless time analyzing their dependences. */
3865 if (i != datarefs.length ())
3867 gcc_assert (is_a <bb_vec_info> (vinfo));
3868 for (unsigned j = i; j < datarefs.length (); ++j)
3870 data_reference_p dr = datarefs[j];
3871 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3872 free_data_ref (dr);
3874 datarefs.truncate (i);
3877 return true;
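
/* Illustrative, standalone sketch (not part of this file): two scalar
   loops showing how the analysis above classifies non-constant-step
   accesses.  The access a[i * n] has a DR_STEP that is not an
   INTEGER_CST, so during loop vectorization it is marked
   STMT_VINFO_STRIDED_P; the access a[idx[i]] has no affine evolution at
   all and can only be handled if vect_check_gather_scatter recognizes
   it as a gather and the target supports gather loads.  The function
   and parameter names below are hypothetical.  */

void
strided_and_gather_example (int *restrict out, int *restrict a,
                            int *restrict idx, int n)
{
  for (int i = 0; i < 64; i++)
    out[i] = a[i * n];     /* strided: step is n * sizeof (int), not a constant.  */
  for (int i = 0; i < 64; i++)
    out[i] += a[idx[i]];   /* gather: index is itself loaded from memory.  */
}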
3881 /* Function vect_get_new_vect_var.
3883 Returns a new temporary variable. The current naming scheme uses the
3884 prefix "vect", "vectp", "stmp" or "mask" (depending on the value of
3885 VAR_KIND) for vectorizer-generated variables, appending "_" and NAME
3886 to the prefix if NAME is provided. */
3888 tree
3889 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3891 const char *prefix;
3892 tree new_vect_var;
3894 switch (var_kind)
3896 case vect_simple_var:
3897 prefix = "vect";
3898 break;
3899 case vect_scalar_var:
3900 prefix = "stmp";
3901 break;
3902 case vect_mask_var:
3903 prefix = "mask";
3904 break;
3905 case vect_pointer_var:
3906 prefix = "vectp";
3907 break;
3908 default:
3909 gcc_unreachable ();
3912 if (name)
3914 char* tmp = concat (prefix, "_", name, NULL);
3915 new_vect_var = create_tmp_reg (type, tmp);
3916 free (tmp);
3918 else
3919 new_vect_var = create_tmp_reg (type, prefix);
3921 return new_vect_var;
3924 /* Like vect_get_new_vect_var but return an SSA name. */
3926 tree
3927 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3929 const char *prefix;
3930 tree new_vect_var;
3932 switch (var_kind)
3934 case vect_simple_var:
3935 prefix = "vect";
3936 break;
3937 case vect_scalar_var:
3938 prefix = "stmp";
3939 break;
3940 case vect_pointer_var:
3941 prefix = "vectp";
3942 break;
3943 default:
3944 gcc_unreachable ();
3947 if (name)
3949 char* tmp = concat (prefix, "_", name, NULL);
3950 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
3951 free (tmp);
3953 else
3954 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
3956 return new_vect_var;
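
/* Standalone sketch (not part of GCC) of just the naming scheme used by
   the two helpers above: a prefix selected by VAR_KIND, then "_" and
   NAME when NAME is given.  E.g. a pointer variable named "a" becomes
   "vectp_a", and an unnamed scalar temporary becomes "stmp".  All
   identifiers below are hypothetical.  */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

enum sketch_var_kind { SK_SIMPLE, SK_SCALAR, SK_MASK, SK_POINTER };

static char *
sketch_var_name (enum sketch_var_kind kind, const char *name)
{
  static const char *const prefix[] = { "vect", "stmp", "mask", "vectp" };
  size_t len = strlen (prefix[kind]) + (name ? strlen (name) + 1 : 0) + 1;
  char *res = malloc (len);
  if (name)
    snprintf (res, len, "%s_%s", prefix[kind], name);   /* e.g. "vectp_a"  */
  else
    snprintf (res, len, "%s", prefix[kind]);            /* e.g. "stmp"  */
  return res;   /* caller frees  */
}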
3959 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
3961 static void
3962 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3963 stmt_vec_info stmt_info)
3965 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3966 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3967 int misalign = DR_MISALIGNMENT (dr);
3968 if (misalign == -1)
3969 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3970 else
3971 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
3974 /* Function vect_create_addr_base_for_vector_ref.
3976 Create an expression that computes the address of the first memory location
3977 that will be accessed for a data reference.
3979 Input:
3980 STMT: The statement containing the data reference.
3981 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3982 OFFSET: Optional. If supplied, it is added to the initial address.
3983 LOOP: Specifies relative to which loop-nest the address should be computed.
3984 For example, when the dataref is in an inner-loop nested in an
3985 outer-loop that is now being vectorized, LOOP can be either the
3986 outer-loop, or the inner-loop. The first memory location accessed
3987 by the following dataref ('in' points to short):
3989 for (i=0; i<N; i++)
3990 for (j=0; j<M; j++)
3991 s += in[i+j]
3993 is as follows:
3994 if LOOP=i_loop: &in (relative to i_loop)
3995 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3996 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
3997 initial address. Unlike OFFSET, which is the number of elements to
3998 be added, BYTE_OFFSET is measured in bytes.
4000 Output:
4001 1. Return an SSA_NAME whose value is the address of the memory location of
4002 the first vector of the data reference.
4003 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4004 these statement(s) which define the returned SSA_NAME.
4006 FORNOW: We are only handling array accesses with step 1. */
4008 tree
4009 vect_create_addr_base_for_vector_ref (gimple *stmt,
4010 gimple_seq *new_stmt_list,
4011 tree offset,
4012 struct loop *loop,
4013 tree byte_offset)
4015 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4016 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4017 tree data_ref_base;
4018 const char *base_name;
4019 tree addr_base;
4020 tree dest;
4021 gimple_seq seq = NULL;
4022 tree base_offset;
4023 tree init;
4024 tree vect_ptr_type;
4025 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4026 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4028 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
4030 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
4032 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
4034 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4035 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
4036 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
4038 else
4040 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
4041 base_offset = unshare_expr (DR_OFFSET (dr));
4042 init = unshare_expr (DR_INIT (dr));
4045 if (loop_vinfo)
4046 base_name = get_name (data_ref_base);
4047 else
4049 base_offset = ssize_int (0);
4050 init = ssize_int (0);
4051 base_name = get_name (DR_REF (dr));
4054 /* Create base_offset */
4055 base_offset = size_binop (PLUS_EXPR,
4056 fold_convert (sizetype, base_offset),
4057 fold_convert (sizetype, init));
4059 if (offset)
4061 offset = fold_build2 (MULT_EXPR, sizetype,
4062 fold_convert (sizetype, offset), step);
4063 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4064 base_offset, offset);
4066 if (byte_offset)
4068 byte_offset = fold_convert (sizetype, byte_offset);
4069 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4070 base_offset, byte_offset);
4073 /* base + base_offset */
4074 if (loop_vinfo)
4075 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4076 else
4078 addr_base = build1 (ADDR_EXPR,
4079 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4080 unshare_expr (DR_REF (dr)));
4083 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4084 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4085 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4086 gimple_seq_add_seq (new_stmt_list, seq);
4088 if (DR_PTR_INFO (dr)
4089 && TREE_CODE (addr_base) == SSA_NAME
4090 && !SSA_NAME_PTR_INFO (addr_base))
4092 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4093 if (offset || byte_offset)
4094 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4097 if (dump_enabled_p ())
4099 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4100 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4101 dump_printf (MSG_NOTE, "\n");
4104 return addr_base;
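
/* Standalone sketch (not part of GCC) of the byte-offset arithmetic
   performed above for the loop-vectorization case: base_offset is
   DR_OFFSET + DR_INIT, plus OFFSET scaled by the element size (OFFSET
   counts elements) and BYTE_OFFSET (already in bytes); the result is
   then added to DR_BASE_ADDRESS.  The names below are hypothetical.  */

#include <stddef.h>

static inline size_t
addr_base_offset_sketch (size_t dr_offset, size_t dr_init,
                         size_t elem_size, size_t offset,
                         size_t byte_offset)
{
  size_t base_offset = dr_offset + dr_init;
  base_offset += offset * elem_size;   /* OFFSET is measured in elements.  */
  base_offset += byte_offset;          /* BYTE_OFFSET is measured in bytes.  */
  return base_offset;                  /* added to DR_BASE_ADDRESS  */
}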
4108 /* Function vect_create_data_ref_ptr.
4110 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4111 location accessed in the loop by STMT, along with the def-use update
4112 chain to appropriately advance the pointer through the loop iterations.
4113 Also set aliasing information for the pointer. This pointer is used by
4114 the callers to this function to create a memory reference expression for
4115 vector load/store access.
4117 Input:
4118 1. STMT: a stmt that references memory. Expected to be of the form
4119 GIMPLE_ASSIGN <name, data-ref> or
4120 GIMPLE_ASSIGN <data-ref, name>.
4121 2. AGGR_TYPE: the type of the reference, which should be either a vector
4122 or an array.
4123 3. AT_LOOP: the loop where the vector memref is to be created.
4124 4. OFFSET (optional): an offset to be added to the initial address accessed
4125 by the data-ref in STMT.
4126 5. BSI: location where the new stmts are to be placed if there is no loop
4127 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4128 pointing to the initial address.
4129 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4130 to the initial address accessed by the data-ref in STMT. This is
4131 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4132 in bytes.
4134 Output:
4135 1. Declare a new ptr to vector_type, and have it point to the base of the
4136 data reference (initial address accessed by the data reference).
4137 For example, for vector of type V8HI, the following code is generated:
4139 v8hi *ap;
4140 ap = (v8hi *)initial_address;
4142 if OFFSET is not supplied:
4143 initial_address = &a[init];
4144 if OFFSET is supplied:
4145 initial_address = &a[init + OFFSET];
4146 if BYTE_OFFSET is supplied:
4147 initial_address = &a[init] + BYTE_OFFSET;
4149 Return the initial_address in INITIAL_ADDRESS.
4151 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4152 update the pointer in each iteration of the loop.
4154 Return the increment stmt that updates the pointer in PTR_INCR.
4156 3. Set INV_P to true if the access pattern of the data reference in the
4157 vectorized loop is invariant. Set it to false otherwise.
4159 4. Return the pointer. */
4161 tree
4162 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4163 tree offset, tree *initial_address,
4164 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4165 bool only_init, bool *inv_p, tree byte_offset)
4167 const char *base_name;
4168 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4169 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4170 struct loop *loop = NULL;
4171 bool nested_in_vect_loop = false;
4172 struct loop *containing_loop = NULL;
4173 tree aggr_ptr_type;
4174 tree aggr_ptr;
4175 tree new_temp;
4176 gimple_seq new_stmt_list = NULL;
4177 edge pe = NULL;
4178 basic_block new_bb;
4179 tree aggr_ptr_init;
4180 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4181 tree aptr;
4182 gimple_stmt_iterator incr_gsi;
4183 bool insert_after;
4184 tree indx_before_incr, indx_after_incr;
4185 gimple *incr;
4186 tree step;
4187 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4189 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4190 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4192 if (loop_vinfo)
4194 loop = LOOP_VINFO_LOOP (loop_vinfo);
4195 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4196 containing_loop = (gimple_bb (stmt))->loop_father;
4197 pe = loop_preheader_edge (loop);
4199 else
4201 gcc_assert (bb_vinfo);
4202 only_init = true;
4203 *ptr_incr = NULL;
4206 /* Check the step (evolution) of the load in LOOP, and record
4207 whether it's invariant. */
4208 if (nested_in_vect_loop)
4209 step = STMT_VINFO_DR_STEP (stmt_info);
4210 else
4211 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4213 if (integer_zerop (step))
4214 *inv_p = true;
4215 else
4216 *inv_p = false;
4218 /* Create an expression for the first address accessed by this load
4219 in LOOP. */
4220 base_name = get_name (DR_BASE_ADDRESS (dr));
4222 if (dump_enabled_p ())
4224 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4225 dump_printf_loc (MSG_NOTE, vect_location,
4226 "create %s-pointer variable to type: ",
4227 get_tree_code_name (TREE_CODE (aggr_type)));
4228 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4229 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4230 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4231 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4232 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4233 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4234 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4235 else
4236 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4237 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4238 dump_printf (MSG_NOTE, "\n");
4241 /* (1) Create the new aggregate-pointer variable.
4242 Vector and array types inherit the alias set of their component
4243 type by default so we need to use a ref-all pointer if the data
4244 reference does not conflict with the created aggregated data
4245 reference because it is not addressable. */
4246 bool need_ref_all = false;
4247 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4248 get_alias_set (DR_REF (dr))))
4249 need_ref_all = true;
4250 /* Likewise for any of the data references in the stmt group. */
4251 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4253 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4256 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4257 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4258 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4259 get_alias_set (DR_REF (sdr))))
4261 need_ref_all = true;
4262 break;
4264 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4266 while (orig_stmt);
4268 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4269 need_ref_all);
4270 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4273 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4274 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4275 def-use update cycles for the pointer: one relative to the outer-loop
4276 (LOOP), which is what steps (3) and (4) below do. The other is relative
4277 to the inner-loop (which is the inner-most loop containing the dataref),
4278 and this is done by step (5) below.
4280 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4281 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4282 redundant. Steps (3),(4) create the following:
4284 vp0 = &base_addr;
4285 LOOP: vp1 = phi(vp0,vp2)
4288 vp2 = vp1 + step
4289 goto LOOP
4291 If there is an inner-loop nested in loop, then step (5) will also be
4292 applied, and an additional update in the inner-loop will be created:
4294 vp0 = &base_addr;
4295 LOOP: vp1 = phi(vp0,vp2)
4297 inner: vp3 = phi(vp1,vp4)
4298 vp4 = vp3 + inner_step
4299 if () goto inner
4301 vp2 = vp1 + step
4302 if () goto LOOP */
4304 /* (2) Calculate the initial address of the aggregate-pointer, and set
4305 the aggregate-pointer to point to it before the loop. */
4307 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4309 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4310 offset, loop, byte_offset);
4311 if (new_stmt_list)
4313 if (pe)
4315 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4316 gcc_assert (!new_bb);
4318 else
4319 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4322 *initial_address = new_temp;
4323 aggr_ptr_init = new_temp;
4325 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4326 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4327 inner-loop nested in LOOP (during outer-loop vectorization). */
4329 /* No update in loop is required. */
4330 if (only_init && (!loop_vinfo || at_loop == loop))
4331 aptr = aggr_ptr_init;
4332 else
4334 /* The step of the aggregate pointer is the type size. */
4335 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4336 /* One exception to the above is when the scalar step of the load in
4337 LOOP is zero. In this case the step here is also zero. */
4338 if (*inv_p)
4339 iv_step = size_zero_node;
4340 else if (tree_int_cst_sgn (step) == -1)
4341 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4343 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4345 create_iv (aggr_ptr_init,
4346 fold_convert (aggr_ptr_type, iv_step),
4347 aggr_ptr, loop, &incr_gsi, insert_after,
4348 &indx_before_incr, &indx_after_incr);
4349 incr = gsi_stmt (incr_gsi);
4350 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4352 /* Copy the points-to information if it exists. */
4353 if (DR_PTR_INFO (dr))
4355 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4356 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4358 if (ptr_incr)
4359 *ptr_incr = incr;
4361 aptr = indx_before_incr;
4364 if (!nested_in_vect_loop || only_init)
4365 return aptr;
4368 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4369 nested in LOOP, if it exists. */
4371 gcc_assert (nested_in_vect_loop);
4372 if (!only_init)
4374 standard_iv_increment_position (containing_loop, &incr_gsi,
4375 &insert_after);
4376 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4377 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4378 &indx_after_incr);
4379 incr = gsi_stmt (incr_gsi);
4380 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4382 /* Copy the points-to information if it exists. */
4383 if (DR_PTR_INFO (dr))
4385 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4386 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4388 if (ptr_incr)
4389 *ptr_incr = incr;
4391 return indx_before_incr;
4393 else
4394 gcc_unreachable ();
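
/* Standalone sketch (not part of GCC): a hand-written analogue of the
   pointer created above for a simple unit-stride copy, using the GCC
   vector extension.  The initial cast corresponds to step (2) and the
   "p = p + 1" update to steps (3)/(4); suitable alignment and
   n % 4 == 0 are assumed by this sketch.  All names are hypothetical.  */

typedef int v4si_sketch __attribute__ ((vector_size (16)));

void
copy_with_vector_ptr_iv (int *restrict dst, const int *restrict src, int n)
{
  v4si_sketch *vp_out = (v4si_sketch *) dst;          /* aggr_ptr_init  */
  const v4si_sketch *vp_in = (const v4si_sketch *) src;
  for (int i = 0; i < n / 4; i++)
    {
      *vp_out = *vp_in;       /* vector access through the pointer IV  */
      vp_out = vp_out + 1;    /* bump by TYPE_SIZE_UNIT (aggr_type)  */
      vp_in = vp_in + 1;
    }
}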
4398 /* Function bump_vector_ptr
4400 Increment a pointer (to a vector type) by vector-size. If requested,
4401 i.e. if PTR-INCR is given, then also connect the new increment stmt
4402 to the existing def-use update-chain of the pointer, by modifying
4403 the PTR_INCR as illustrated below:
4405 The pointer def-use update-chain before this function:
4406 DATAREF_PTR = phi (p_0, p_2)
4407 ....
4408 PTR_INCR: p_2 = DATAREF_PTR + step
4410 The pointer def-use update-chain after this function:
4411 DATAREF_PTR = phi (p_0, p_2)
4412 ....
4413 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4414 ....
4415 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4417 Input:
4418 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4419 in the loop.
4420 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4421 the loop. The increment amount across iterations is expected
4422 to be vector_size.
4423 BSI - location where the new update stmt is to be placed.
4424 STMT - the original scalar memory-access stmt that is being vectorized.
4425 BUMP - optional. The offset by which to bump the pointer. If not given,
4426 the offset is assumed to be vector_size.
4428 Output: Return NEW_DATAREF_PTR as illustrated above.
4432 tree
4433 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4434 gimple *stmt, tree bump)
4436 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4437 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4438 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4439 tree update = TYPE_SIZE_UNIT (vectype);
4440 gassign *incr_stmt;
4441 ssa_op_iter iter;
4442 use_operand_p use_p;
4443 tree new_dataref_ptr;
4445 if (bump)
4446 update = bump;
4448 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4449 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4450 else
4451 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4452 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4453 dataref_ptr, update);
4454 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4456 /* Copy the points-to information if it exists. */
4457 if (DR_PTR_INFO (dr))
4459 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4460 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4463 if (!ptr_incr)
4464 return new_dataref_ptr;
4466 /* Update the vector-pointer's cross-iteration increment. */
4467 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4469 tree use = USE_FROM_PTR (use_p);
4471 if (use == dataref_ptr)
4472 SET_USE (use_p, new_dataref_ptr);
4473 else
4474 gcc_assert (tree_int_cst_compare (use, update) == 0);
4477 return new_dataref_ptr;
4481 /* Function vect_create_destination_var.
4483 Create a new temporary of type VECTYPE. */
4485 tree
4486 vect_create_destination_var (tree scalar_dest, tree vectype)
4488 tree vec_dest;
4489 const char *name;
4490 char *new_name;
4491 tree type;
4492 enum vect_var_kind kind;
4494 kind = vectype
4495 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4496 ? vect_mask_var
4497 : vect_simple_var
4498 : vect_scalar_var;
4499 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4501 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4503 name = get_name (scalar_dest);
4504 if (name)
4505 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4506 else
4507 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4508 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4509 free (new_name);
4511 return vec_dest;
4514 /* Function vect_grouped_store_supported.
4516 Returns TRUE if interleave high and interleave low permutations
4517 are supported, and FALSE otherwise. */
4519 bool
4520 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4522 machine_mode mode = TYPE_MODE (vectype);
4524 /* vect_permute_store_chain requires the group size to be equal to 3 or
4525 be a power of two. */
4526 if (count != 3 && exact_log2 (count) == -1)
4528 if (dump_enabled_p ())
4529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4530 "the size of the group of accesses"
4531 " is not a power of 2 or not eqaul to 3\n");
4532 return false;
4535 /* Check that the permutation is supported. */
4536 if (VECTOR_MODE_P (mode))
4538 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4539 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4541 if (count == 3)
4543 unsigned int j0 = 0, j1 = 0, j2 = 0;
4544 unsigned int i, j;
4546 for (j = 0; j < 3; j++)
4548 int nelt0 = ((3 - j) * nelt) % 3;
4549 int nelt1 = ((3 - j) * nelt + 1) % 3;
4550 int nelt2 = ((3 - j) * nelt + 2) % 3;
4551 for (i = 0; i < nelt; i++)
4553 if (3 * i + nelt0 < nelt)
4554 sel[3 * i + nelt0] = j0++;
4555 if (3 * i + nelt1 < nelt)
4556 sel[3 * i + nelt1] = nelt + j1++;
4557 if (3 * i + nelt2 < nelt)
4558 sel[3 * i + nelt2] = 0;
4560 if (!can_vec_perm_p (mode, false, sel))
4562 if (dump_enabled_p ())
4563 dump_printf (MSG_MISSED_OPTIMIZATION,
4564 "permutaion op not supported by target.\n");
4565 return false;
4568 for (i = 0; i < nelt; i++)
4570 if (3 * i + nelt0 < nelt)
4571 sel[3 * i + nelt0] = 3 * i + nelt0;
4572 if (3 * i + nelt1 < nelt)
4573 sel[3 * i + nelt1] = 3 * i + nelt1;
4574 if (3 * i + nelt2 < nelt)
4575 sel[3 * i + nelt2] = nelt + j2++;
4577 if (!can_vec_perm_p (mode, false, sel))
4579 if (dump_enabled_p ())
4580 dump_printf (MSG_MISSED_OPTIMIZATION,
4581 "permutaion op not supported by target.\n");
4582 return false;
4585 return true;
4587 else
4589 /* If length is not equal to 3 then only power of 2 is supported. */
4590 gcc_assert (exact_log2 (count) != -1);
4592 for (i = 0; i < nelt / 2; i++)
4594 sel[i * 2] = i;
4595 sel[i * 2 + 1] = i + nelt;
4597 if (can_vec_perm_p (mode, false, sel))
4599 for (i = 0; i < nelt; i++)
4600 sel[i] += nelt / 2;
4601 if (can_vec_perm_p (mode, false, sel))
4602 return true;
4607 if (dump_enabled_p ())
4608 dump_printf (MSG_MISSED_OPTIMIZATION,
4609 "permutaion op not supported by target.\n");
4610 return false;
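
/* Standalone sketch (not part of GCC): the two permutation selectors
   whose support is tested above for the power-of-2 case.  For NELT == 8
   the results are
     interleave-high mask: 0 8 1 9 2 10 3 11   (first halves interleaved)
     interleave-low  mask: 4 12 5 13 6 14 7 15 (second halves interleaved)
   The function name below is hypothetical.  */

static void
build_interleave_masks_sketch (unsigned nelt, unsigned char *sel_high,
                               unsigned char *sel_low)
{
  for (unsigned i = 0; i < nelt / 2; i++)
    {
      sel_high[i * 2] = i;
      sel_high[i * 2 + 1] = i + nelt;
    }
  for (unsigned i = 0; i < nelt; i++)
    sel_low[i] = sel_high[i] + nelt / 2;
}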
4614 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4615 type VECTYPE. */
4617 bool
4618 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4620 return vect_lanes_optab_supported_p ("vec_store_lanes",
4621 vec_store_lanes_optab,
4622 vectype, count);
4626 /* Function vect_permute_store_chain.
4628 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4629 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4630 the data correctly for the stores. Return the final references for stores
4631 in RESULT_CHAIN.
4633 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4634 The input is 4 vectors each containing 8 elements. We assign a number to
4635 each element, the input sequence is:
4637 1st vec: 0 1 2 3 4 5 6 7
4638 2nd vec: 8 9 10 11 12 13 14 15
4639 3rd vec: 16 17 18 19 20 21 22 23
4640 4th vec: 24 25 26 27 28 29 30 31
4642 The output sequence should be:
4644 1st vec: 0 8 16 24 1 9 17 25
4645 2nd vec: 2 10 18 26 3 11 19 27
4646 3rd vec: 4 12 20 28 5 13 21 29
4647 4th vec: 6 14 22 30 7 15 23 31
4649 i.e., we interleave the contents of the four vectors in their order.
4651 We use interleave_high/low instructions to create such output. The input of
4652 each interleave_high/low operation is two vectors:
4653 1st vec 2nd vec
4654 0 1 2 3 4 5 6 7
4655 the even elements of the result vector are obtained left-to-right from the
4656 high/low elements of the first vector. The odd elements of the result are
4657 obtained left-to-right from the high/low elements of the second vector.
4658 The output of interleave_high will be: 0 4 1 5
4659 and of interleave_low: 2 6 3 7
4662 The permutation is done in log LENGTH stages. In each stage interleave_high
4663 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4664 where the first argument is taken from the first half of DR_CHAIN and the
4665 second argument from its second half.
4666 In our example,
4668 I1: interleave_high (1st vec, 3rd vec)
4669 I2: interleave_low (1st vec, 3rd vec)
4670 I3: interleave_high (2nd vec, 4th vec)
4671 I4: interleave_low (2nd vec, 4th vec)
4673 The output for the first stage is:
4675 I1: 0 16 1 17 2 18 3 19
4676 I2: 4 20 5 21 6 22 7 23
4677 I3: 8 24 9 25 10 26 11 27
4678 I4: 12 28 13 29 14 30 15 31
4680 The output of the second stage, i.e. the final result is:
4682 I1: 0 8 16 24 1 9 17 25
4683 I2: 2 10 18 26 3 11 19 27
4684 I3: 4 12 20 28 5 13 21 29
4685 I4: 6 14 22 30 7 15 23 31. */
4687 void
4688 vect_permute_store_chain (vec<tree> dr_chain,
4689 unsigned int length,
4690 gimple *stmt,
4691 gimple_stmt_iterator *gsi,
4692 vec<tree> *result_chain)
4694 tree vect1, vect2, high, low;
4695 gimple *perm_stmt;
4696 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4697 tree perm_mask_low, perm_mask_high;
4698 tree data_ref;
4699 tree perm3_mask_low, perm3_mask_high;
4700 unsigned int i, n, log_length = exact_log2 (length);
4701 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4702 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4704 result_chain->quick_grow (length);
4705 memcpy (result_chain->address (), dr_chain.address (),
4706 length * sizeof (tree));
4708 if (length == 3)
4710 unsigned int j0 = 0, j1 = 0, j2 = 0;
4712 for (j = 0; j < 3; j++)
4714 int nelt0 = ((3 - j) * nelt) % 3;
4715 int nelt1 = ((3 - j) * nelt + 1) % 3;
4716 int nelt2 = ((3 - j) * nelt + 2) % 3;
4718 for (i = 0; i < nelt; i++)
4720 if (3 * i + nelt0 < nelt)
4721 sel[3 * i + nelt0] = j0++;
4722 if (3 * i + nelt1 < nelt)
4723 sel[3 * i + nelt1] = nelt + j1++;
4724 if (3 * i + nelt2 < nelt)
4725 sel[3 * i + nelt2] = 0;
4727 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4729 for (i = 0; i < nelt; i++)
4731 if (3 * i + nelt0 < nelt)
4732 sel[3 * i + nelt0] = 3 * i + nelt0;
4733 if (3 * i + nelt1 < nelt)
4734 sel[3 * i + nelt1] = 3 * i + nelt1;
4735 if (3 * i + nelt2 < nelt)
4736 sel[3 * i + nelt2] = nelt + j2++;
4738 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4740 vect1 = dr_chain[0];
4741 vect2 = dr_chain[1];
4743 /* Create interleaving stmt:
4744 low = VEC_PERM_EXPR <vect1, vect2,
4745 {j, nelt, *, j + 1, nelt + j + 1, *,
4746 j + 2, nelt + j + 2, *, ...}> */
4747 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4748 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4749 vect2, perm3_mask_low);
4750 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4752 vect1 = data_ref;
4753 vect2 = dr_chain[2];
4754 /* Create interleaving stmt:
4755 low = VEC_PERM_EXPR <vect1, vect2,
4756 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4757 6, 7, nelt + j + 2, ...}> */
4758 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4759 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4760 vect2, perm3_mask_high);
4761 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4762 (*result_chain)[j] = data_ref;
4765 else
4767 /* If length is not equal to 3 then only power of 2 is supported. */
4768 gcc_assert (exact_log2 (length) != -1);
4770 for (i = 0, n = nelt / 2; i < n; i++)
4772 sel[i * 2] = i;
4773 sel[i * 2 + 1] = i + nelt;
4775 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4777 for (i = 0; i < nelt; i++)
4778 sel[i] += nelt / 2;
4779 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4781 for (i = 0, n = log_length; i < n; i++)
4783 for (j = 0; j < length/2; j++)
4785 vect1 = dr_chain[j];
4786 vect2 = dr_chain[j+length/2];
4788 /* Create interleaving stmt:
4789 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4790 ...}> */
4791 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4792 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4793 vect2, perm_mask_high);
4794 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4795 (*result_chain)[2*j] = high;
4797 /* Create interleaving stmt:
4798 low = VEC_PERM_EXPR <vect1, vect2,
4799 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4800 ...}> */
4801 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4802 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4803 vect2, perm_mask_low);
4804 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4805 (*result_chain)[2*j+1] = low;
4807 memcpy (dr_chain.address (), result_chain->address (),
4808 length * sizeof (tree));
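
/* Standalone sketch (not part of GCC): a scalar model of the two
   interleave stages above for LENGTH == 4 and NELT == 8.  Starting from
   vectors holding 0..7, 8..15, 16..23 and 24..31 it ends with
     0 8 16 24 1 9 17 25 | 2 10 18 26 3 11 19 27 |
     4 12 20 28 5 13 21 29 | 6 14 22 30 7 15 23 31
   matching the example in the function comment.  Names are hypothetical.  */

#include <string.h>

#define SK_NELT 8
#define SK_LEN 4

static void
interleave_pair_sketch (const int *a, const int *b, int *high, int *low)
{
  for (int i = 0; i < SK_NELT / 2; i++)
    {
      high[i * 2] = a[i];                  /* interleave_high: first halves  */
      high[i * 2 + 1] = b[i];
      low[i * 2] = a[i + SK_NELT / 2];     /* interleave_low: second halves  */
      low[i * 2 + 1] = b[i + SK_NELT / 2];
    }
}

static void
permute_store_chain_sketch (int chain[SK_LEN][SK_NELT])
{
  int result[SK_LEN][SK_NELT];
  for (int stage = 0; stage < 2; stage++)      /* log2 (SK_LEN) stages  */
    {
      for (int j = 0; j < SK_LEN / 2; j++)
        interleave_pair_sketch (chain[j], chain[j + SK_LEN / 2],
                                result[2 * j], result[2 * j + 1]);
      memcpy (chain, result, sizeof (result));
    }
}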
4813 /* Function vect_setup_realignment
4815 This function is called when vectorizing an unaligned load using
4816 the dr_explicit_realign[_optimized] scheme.
4817 This function generates the following code at the loop prolog:
4819 p = initial_addr;
4820 x msq_init = *(floor(p)); # prolog load
4821 realignment_token = call target_builtin;
4822 loop:
4823 x msq = phi (msq_init, ---)
4825 The stmts marked with x are generated only for the case of
4826 dr_explicit_realign_optimized.
4828 The code above sets up a new (vector) pointer, pointing to the first
4829 location accessed by STMT, and a "floor-aligned" load using that pointer.
4830 It also generates code to compute the "realignment-token" (if the relevant
4831 target hook was defined), and creates a phi-node at the loop-header bb
4832 whose arguments are the result of the prolog-load (created by this
4833 function) and the result of a load that takes place in the loop (to be
4834 created by the caller to this function).
4836 For the case of dr_explicit_realign_optimized:
4837 The caller to this function uses the phi-result (msq) to create the
4838 realignment code inside the loop, and sets up the missing phi argument,
4839 as follows:
4840 loop:
4841 msq = phi (msq_init, lsq)
4842 lsq = *(floor(p')); # load in loop
4843 result = realign_load (msq, lsq, realignment_token);
4845 For the case of dr_explicit_realign:
4846 loop:
4847 msq = *(floor(p)); # load in loop
4848 p' = p + (VS-1);
4849 lsq = *(floor(p')); # load in loop
4850 result = realign_load (msq, lsq, realignment_token);
4852 Input:
4853 STMT - (scalar) load stmt to be vectorized. This load accesses
4854 a memory location that may be unaligned.
4855 BSI - place where new code is to be inserted.
4856 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4857 is used.
4859 Output:
4860 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4861 target hook, if defined.
4862 Return value - the result of the loop-header phi node. */
4864 tree
4865 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4866 tree *realignment_token,
4867 enum dr_alignment_support alignment_support_scheme,
4868 tree init_addr,
4869 struct loop **at_loop)
4871 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4872 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4873 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4874 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4875 struct loop *loop = NULL;
4876 edge pe = NULL;
4877 tree scalar_dest = gimple_assign_lhs (stmt);
4878 tree vec_dest;
4879 gimple *inc;
4880 tree ptr;
4881 tree data_ref;
4882 basic_block new_bb;
4883 tree msq_init = NULL_TREE;
4884 tree new_temp;
4885 gphi *phi_stmt;
4886 tree msq = NULL_TREE;
4887 gimple_seq stmts = NULL;
4888 bool inv_p;
4889 bool compute_in_loop = false;
4890 bool nested_in_vect_loop = false;
4891 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4892 struct loop *loop_for_initial_load = NULL;
4894 if (loop_vinfo)
4896 loop = LOOP_VINFO_LOOP (loop_vinfo);
4897 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4900 gcc_assert (alignment_support_scheme == dr_explicit_realign
4901 || alignment_support_scheme == dr_explicit_realign_optimized);
4903 /* We need to generate three things:
4904 1. the misalignment computation
4905 2. the extra vector load (for the optimized realignment scheme).
4906 3. the phi node for the two vectors from which the realignment is
4907 done (for the optimized realignment scheme). */
4909 /* 1. Determine where to generate the misalignment computation.
4911 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4912 calculation will be generated by this function, outside the loop (in the
4913 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4914 caller, inside the loop.
4916 Background: If the misalignment remains fixed throughout the iterations of
4917 the loop, then both realignment schemes are applicable, and also the
4918 misalignment computation can be done outside LOOP. This is because we are
4919 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4920 are a multiple of VS (the Vector Size), and therefore the misalignment in
4921 different vectorized LOOP iterations is always the same.
4922 The problem arises only if the memory access is in an inner-loop nested
4923 inside LOOP, which is now being vectorized using outer-loop vectorization.
4924 This is the only case when the misalignment of the memory access may not
4925 remain fixed throughout the iterations of the inner-loop (as explained in
4926 detail in vect_supportable_dr_alignment). In this case, not only is the
4927 optimized realignment scheme not applicable, but also the misalignment
4928 computation (and generation of the realignment token that is passed to
4929 REALIGN_LOAD) have to be done inside the loop.
4931 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4932 or not, which in turn determines if the misalignment is computed inside
4933 the inner-loop, or outside LOOP. */
4935 if (init_addr != NULL_TREE || !loop_vinfo)
4937 compute_in_loop = true;
4938 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4942 /* 2. Determine where to generate the extra vector load.
4944 For the optimized realignment scheme, instead of generating two vector
4945 loads in each iteration, we generate a single extra vector load in the
4946 preheader of the loop, and in each iteration reuse the result of the
4947 vector load from the previous iteration. In case the memory access is in
4948 an inner-loop nested inside LOOP, which is now being vectorized using
4949 outer-loop vectorization, we need to determine whether this initial vector
4950 load should be generated at the preheader of the inner-loop, or can be
4951 generated at the preheader of LOOP. If the memory access has no evolution
4952 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4953 to be generated inside LOOP (in the preheader of the inner-loop). */
4955 if (nested_in_vect_loop)
4957 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4958 bool invariant_in_outerloop =
4959 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4960 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4962 else
4963 loop_for_initial_load = loop;
4964 if (at_loop)
4965 *at_loop = loop_for_initial_load;
4967 if (loop_for_initial_load)
4968 pe = loop_preheader_edge (loop_for_initial_load);
4970 /* 3. For the case of the optimized realignment, create the first vector
4971 load at the loop preheader. */
4973 if (alignment_support_scheme == dr_explicit_realign_optimized)
4975 /* Create msq_init = *(floor(p1)) in the loop preheader */
4976 gassign *new_stmt;
4978 gcc_assert (!compute_in_loop);
4979 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4980 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4981 NULL_TREE, &init_addr, NULL, &inc,
4982 true, &inv_p);
4983 if (TREE_CODE (ptr) == SSA_NAME)
4984 new_temp = copy_ssa_name (ptr);
4985 else
4986 new_temp = make_ssa_name (TREE_TYPE (ptr));
4987 new_stmt = gimple_build_assign
4988 (new_temp, BIT_AND_EXPR, ptr,
4989 build_int_cst (TREE_TYPE (ptr),
4990 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4991 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4992 gcc_assert (!new_bb);
4993 data_ref
4994 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4995 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4996 new_stmt = gimple_build_assign (vec_dest, data_ref);
4997 new_temp = make_ssa_name (vec_dest, new_stmt);
4998 gimple_assign_set_lhs (new_stmt, new_temp);
4999 if (pe)
5001 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5002 gcc_assert (!new_bb);
5004 else
5005 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5007 msq_init = gimple_assign_lhs (new_stmt);
5010 /* 4. Create realignment token using a target builtin, if available.
5011 It is done either inside the containing loop, or before LOOP (as
5012 determined above). */
5014 if (targetm.vectorize.builtin_mask_for_load)
5016 gcall *new_stmt;
5017 tree builtin_decl;
5019 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5020 if (!init_addr)
5022 /* Generate the INIT_ADDR computation outside LOOP. */
5023 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5024 NULL_TREE, loop);
5025 if (loop)
5027 pe = loop_preheader_edge (loop);
5028 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5029 gcc_assert (!new_bb);
5031 else
5032 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5035 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5036 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5037 vec_dest =
5038 vect_create_destination_var (scalar_dest,
5039 gimple_call_return_type (new_stmt));
5040 new_temp = make_ssa_name (vec_dest, new_stmt);
5041 gimple_call_set_lhs (new_stmt, new_temp);
5043 if (compute_in_loop)
5044 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5045 else
5047 /* Generate the misalignment computation outside LOOP. */
5048 pe = loop_preheader_edge (loop);
5049 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5050 gcc_assert (!new_bb);
5053 *realignment_token = gimple_call_lhs (new_stmt);
5055 /* The result of the CALL_EXPR to this builtin is determined from
5056 the value of the parameter and no global variables are touched
5057 which makes the builtin a "const" function. Requiring the
5058 builtin to have the "const" attribute makes it unnecessary
5059 to call mark_call_clobbered. */
5060 gcc_assert (TREE_READONLY (builtin_decl));
5063 if (alignment_support_scheme == dr_explicit_realign)
5064 return msq;
5066 gcc_assert (!compute_in_loop);
5067 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5070 /* 5. Create msq = phi <msq_init, lsq> in loop */
5072 pe = loop_preheader_edge (containing_loop);
5073 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5074 msq = make_ssa_name (vec_dest);
5075 phi_stmt = create_phi_node (msq, containing_loop->header);
5076 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5078 return msq;
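
/* Standalone sketch (not part of GCC): a scalar model of what the
   explicit-realignment scheme above computes for byte elements.  Two
   aligned VS-byte loads that straddle the unaligned address P are
   combined into the VS bytes starting at P; the misalignment of P plays
   the role of the realignment token.  VS is assumed to be a power of
   two, and the names below are hypothetical.  */

#include <stdint.h>

static void
realign_load_sketch (const unsigned char *p, unsigned vs, unsigned char *out)
{
  const unsigned char *lo
    = (const unsigned char *) ((uintptr_t) p & ~(uintptr_t) (vs - 1));
  const unsigned char *hi = lo + vs;    /* floor (p + VS - 1) when P is unaligned  */
  unsigned m = (unsigned) (p - lo);     /* misalignment in bytes  */
  for (unsigned i = 0; i < vs; i++)
    out[i] = i + m < vs ? lo[i + m] : hi[i + m - vs];
}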
5082 /* Function vect_grouped_load_supported.
5084 Returns TRUE if even and odd permutations are supported,
5085 and FALSE otherwise. */
5087 bool
5088 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
5090 machine_mode mode = TYPE_MODE (vectype);
5092 /* vect_permute_load_chain requires the group size to be equal to 3 or
5093 be a power of two. */
5094 if (count != 3 && exact_log2 (count) == -1)
5096 if (dump_enabled_p ())
5097 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5098 "the size of the group of accesses"
5099 " is not a power of 2 or not equal to 3\n");
5100 return false;
5103 /* Check that the permutation is supported. */
5104 if (VECTOR_MODE_P (mode))
5106 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5107 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5109 if (count == 3)
5111 unsigned int k;
5112 for (k = 0; k < 3; k++)
5114 for (i = 0; i < nelt; i++)
5115 if (3 * i + k < 2 * nelt)
5116 sel[i] = 3 * i + k;
5117 else
5118 sel[i] = 0;
5119 if (!can_vec_perm_p (mode, false, sel))
5121 if (dump_enabled_p ())
5122 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5123 "shuffle of 3 loads is not supported by"
5124 " target\n");
5125 return false;
5127 for (i = 0, j = 0; i < nelt; i++)
5128 if (3 * i + k < 2 * nelt)
5129 sel[i] = i;
5130 else
5131 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5132 if (!can_vec_perm_p (mode, false, sel))
5134 if (dump_enabled_p ())
5135 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5136 "shuffle of 3 loads is not supported by"
5137 " target\n");
5138 return false;
5141 return true;
5143 else
5145 /* If length is not equal to 3 then only power of 2 is supported. */
5146 gcc_assert (exact_log2 (count) != -1);
5147 for (i = 0; i < nelt; i++)
5148 sel[i] = i * 2;
5149 if (can_vec_perm_p (mode, false, sel))
5151 for (i = 0; i < nelt; i++)
5152 sel[i] = i * 2 + 1;
5153 if (can_vec_perm_p (mode, false, sel))
5154 return true;
5159 if (dump_enabled_p ())
5160 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5161 "extract even/odd not supported by target\n");
5162 return false;
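
/* Standalone sketch (not part of GCC): the two selectors built above for
   the COUNT == 3 case, worked out for NELT == 8 and K == 0.  The first
   mask gathers every third element from the first two vectors, the
   second keeps those and fills the tail from the third vector:
     mask 1: 0 3 6 9 12 15 0 0       (the last two entries are don't-cares)
     mask 2: 0 1 2 3 4 5 10 13
   so three vectors holding 0..7, 8..15 and 16..23 yield
   0 3 6 9 12 15 18 21 as the K == 0 output vector.  The function name
   below is hypothetical.  */

static void
build_load3_masks_sketch (unsigned nelt, unsigned k,
                          unsigned char *sel1, unsigned char *sel2)
{
  unsigned j = 0;
  for (unsigned i = 0; i < nelt; i++)
    sel1[i] = 3 * i + k < 2 * nelt ? 3 * i + k : 0;
  for (unsigned i = 0; i < nelt; i++)
    sel2[i] = 3 * i + k < 2 * nelt ? i : nelt + ((nelt + k) % 3) + 3 * (j++);
}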
5165 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5166 type VECTYPE. */
5168 bool
5169 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5171 return vect_lanes_optab_supported_p ("vec_load_lanes",
5172 vec_load_lanes_optab,
5173 vectype, count);
5176 /* Function vect_permute_load_chain.
5178 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5179 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5180 the input data correctly. Return the final references for loads in
5181 RESULT_CHAIN.
5183 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5184 The input is 4 vectors each containing 8 elements. We assign a number to each
5185 element, the input sequence is:
5187 1st vec: 0 1 2 3 4 5 6 7
5188 2nd vec: 8 9 10 11 12 13 14 15
5189 3rd vec: 16 17 18 19 20 21 22 23
5190 4th vec: 24 25 26 27 28 29 30 31
5192 The output sequence should be:
5194 1st vec: 0 4 8 12 16 20 24 28
5195 2nd vec: 1 5 9 13 17 21 25 29
5196 3rd vec: 2 6 10 14 18 22 26 30
5197 4th vec: 3 7 11 15 19 23 27 31
5199 i.e., the first output vector should contain the first elements of each
5200 interleaving group, etc.
5202 We use extract_even/odd instructions to create such output. The input of
5203 each extract_even/odd operation is two vectors
5204 1st vec 2nd vec
5205 0 1 2 3 4 5 6 7
5207 and the output is the vector of extracted even/odd elements. The output of
5208 extract_even will be: 0 2 4 6
5209 and of extract_odd: 1 3 5 7
5212 The permutation is done in log LENGTH stages. In each stage extract_even
5213 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5214 their order. In our example,
5216 E1: extract_even (1st vec, 2nd vec)
5217 E2: extract_odd (1st vec, 2nd vec)
5218 E3: extract_even (3rd vec, 4th vec)
5219 E4: extract_odd (3rd vec, 4th vec)
5221 The output for the first stage will be:
5223 E1: 0 2 4 6 8 10 12 14
5224 E2: 1 3 5 7 9 11 13 15
5225 E3: 16 18 20 22 24 26 28 30
5226 E4: 17 19 21 23 25 27 29 31
5228 In order to proceed and create the correct sequence for the next stage (or
5229 for the correct output, if the second stage is the last one, as in our
5230 example), we first put the output of extract_even operation and then the
5231 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5232 The input for the second stage is:
5234 1st vec (E1): 0 2 4 6 8 10 12 14
5235 2nd vec (E3): 16 18 20 22 24 26 28 30
5236 3rd vec (E2): 1 3 5 7 9 11 13 15
5237 4th vec (E4): 17 19 21 23 25 27 29 31
5239 The output of the second stage:
5241 E1: 0 4 8 12 16 20 24 28
5242 E2: 2 6 10 14 18 22 26 30
5243 E3: 1 5 9 13 17 21 25 29
5244 E4: 3 7 11 15 19 23 27 31
5246 And RESULT_CHAIN after reordering:
5248 1st vec (E1): 0 4 8 12 16 20 24 28
5249 2nd vec (E3): 1 5 9 13 17 21 25 29
5250 3rd vec (E2): 2 6 10 14 18 22 26 30
5251 4th vec (E4): 3 7 11 15 19 23 27 31. */
5253 static void
5254 vect_permute_load_chain (vec<tree> dr_chain,
5255 unsigned int length,
5256 gimple *stmt,
5257 gimple_stmt_iterator *gsi,
5258 vec<tree> *result_chain)
5260 tree data_ref, first_vect, second_vect;
5261 tree perm_mask_even, perm_mask_odd;
5262 tree perm3_mask_low, perm3_mask_high;
5263 gimple *perm_stmt;
5264 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5265 unsigned int i, j, log_length = exact_log2 (length);
5266 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5267 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5269 result_chain->quick_grow (length);
5270 memcpy (result_chain->address (), dr_chain.address (),
5271 length * sizeof (tree));
5273 if (length == 3)
5275 unsigned int k;
5277 for (k = 0; k < 3; k++)
5279 for (i = 0; i < nelt; i++)
5280 if (3 * i + k < 2 * nelt)
5281 sel[i] = 3 * i + k;
5282 else
5283 sel[i] = 0;
5284 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5286 for (i = 0, j = 0; i < nelt; i++)
5287 if (3 * i + k < 2 * nelt)
5288 sel[i] = i;
5289 else
5290 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5292 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5294 first_vect = dr_chain[0];
5295 second_vect = dr_chain[1];
5297 /* Create interleaving stmt (low part of):
5298 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5299 ...}> */
5300 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5301 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5302 second_vect, perm3_mask_low);
5303 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5305 /* Create interleaving stmt (high part of):
5306 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5307 ...}> */
5308 first_vect = data_ref;
5309 second_vect = dr_chain[2];
5310 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5311 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5312 second_vect, perm3_mask_high);
5313 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5314 (*result_chain)[k] = data_ref;
5317 else
5319 /* If length is not equal to 3 then only power of 2 is supported. */
5320 gcc_assert (exact_log2 (length) != -1);
5322 for (i = 0; i < nelt; ++i)
5323 sel[i] = i * 2;
5324 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5326 for (i = 0; i < nelt; ++i)
5327 sel[i] = i * 2 + 1;
5328 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5330 for (i = 0; i < log_length; i++)
5332 for (j = 0; j < length; j += 2)
5334 first_vect = dr_chain[j];
5335 second_vect = dr_chain[j+1];
5337 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5338 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5339 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5340 first_vect, second_vect,
5341 perm_mask_even);
5342 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5343 (*result_chain)[j/2] = data_ref;
5345 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5346 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5347 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5348 first_vect, second_vect,
5349 perm_mask_odd);
5350 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5351 (*result_chain)[j/2+length/2] = data_ref;
5353 memcpy (dr_chain.address (), result_chain->address (),
5354 length * sizeof (tree));
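
/* Standalone sketch (not part of GCC): a scalar model of the two
   extract-even/odd stages above for LENGTH == 4 and NELT == 8.  Starting
   from vectors holding 0..7, 8..15, 16..23 and 24..31 it ends with
     0 4 8 12 16 20 24 28 | 1 5 9 13 17 21 25 29 |
     2 6 10 14 18 22 26 30 | 3 7 11 15 19 23 27 31
   matching the example in the function comment.  Names are hypothetical.  */

#include <string.h>

#define SK2_NELT 8
#define SK2_LEN 4

static void
extract_even_odd_sketch (const int *a, const int *b, int *even, int *odd)
{
  /* Even/odd positions of the 2 * NELT concatenation of A and B.  */
  for (int i = 0; i < SK2_NELT; i++)
    {
      int e = 2 * i, o = 2 * i + 1;
      even[i] = e < SK2_NELT ? a[e] : b[e - SK2_NELT];
      odd[i] = o < SK2_NELT ? a[o] : b[o - SK2_NELT];
    }
}

static void
permute_load_chain_sketch (int chain[SK2_LEN][SK2_NELT])
{
  int result[SK2_LEN][SK2_NELT];
  for (int stage = 0; stage < 2; stage++)      /* log2 (SK2_LEN) stages  */
    {
      for (int j = 0; j < SK2_LEN; j += 2)
        extract_even_odd_sketch (chain[j], chain[j + 1],
                                 result[j / 2], result[j / 2 + SK2_LEN / 2]);
      memcpy (chain, result, sizeof (result));
    }
}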
5359 /* Function vect_shift_permute_load_chain.
5361 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
5362 sequence of stmts to reorder the input data accordingly.
5363 Return the final references for loads in RESULT_CHAIN.
5364 Return true if it succeeds, false otherwise.
5366 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5367 The input is 3 vectors each containing 8 elements. We assign a
5368 number to each element, the input sequence is:
5370 1st vec: 0 1 2 3 4 5 6 7
5371 2nd vec: 8 9 10 11 12 13 14 15
5372 3rd vec: 16 17 18 19 20 21 22 23
5374 The output sequence should be:
5376 1st vec: 0 3 6 9 12 15 18 21
5377 2nd vec: 1 4 7 10 13 16 19 22
5378 3rd vec: 2 5 8 11 14 17 20 23
5380 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5382 First we shuffle all 3 vectors to get correct elements order:
5384 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5385 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5386 3rd vec: (16 19 22) (17 20 23) (18 21)
5388 Next we unite and shift vector 3 times:
5390 1st step:
5391 shift right by 6 the concatenation of:
5392 "1st vec" and "2nd vec"
5393 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5394 "2nd vec" and "3rd vec"
5395 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5396 "3rd vec" and "1st vec"
5397 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5398 | New vectors |
5400 So that now new vectors are:
5402 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5403 2nd vec: (10 13) (16 19 22) (17 20 23)
5404 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5406 2nd step:
5407 shift right by 5 the concatenation of:
5408 "1st vec" and "3rd vec"
5409 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5410 "2nd vec" and "1st vec"
5411 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5412 "3rd vec" and "2nd vec"
5413 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5414 | New vectors |
5416 So that now new vectors are:
5418 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5419 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5420 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5422 3rd step:
5423 shift right by 5 the concatenation of:
5424 "1st vec" and "1st vec"
5425 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5426 shift right by 3 the concatenation of:
5427 "2nd vec" and "2nd vec"
5428 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5429 | New vectors |
5431 So that now all vectors are READY:
5432 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5433 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5434 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5436    This algorithm is faster than the one in vect_permute_load_chain if:
5437    1. "shift of a concatenation" is faster than a general permutation.
5438       This is usually so.
5439    2. The TARGET machine can't execute vector instructions in parallel.
5440       This is because each step of the algorithm depends on the previous one.
5441 The algorithm in vect_permute_load_chain is much more parallel.
5443 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
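/* For illustration: each "shift right by N" step above is realized below as
   a VEC_PERM_EXPR whose selector is the contiguous window
   { N, N+1, ..., N+NELT-1 } into the concatenation of its two operands.
   A minimal sketch, assuming NELT == 8 and N == 4:

     shift_mask = { 4, 5, 6, 7, 8, 9, 10, 11 }

     for (i = 0; i < 8; i++)
       result[i] = i + 4 < 8 ? v0[i + 4] : v1[i + 4 - 8];

   i.e. the upper half of V0 followed by the lower half of V1.  The selectors
   built below (shift1_mask .. shift4_mask) differ only in the value of N.  */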
5446 static bool
5447 vect_shift_permute_load_chain (vec<tree> dr_chain,
5448 unsigned int length,
5449 gimple *stmt,
5450 gimple_stmt_iterator *gsi,
5451 vec<tree> *result_chain)
5453 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5454 tree perm2_mask1, perm2_mask2, perm3_mask;
5455 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5456 gimple *perm_stmt;
5458 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5459 unsigned int i;
5460 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5461 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5462 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5463 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5465 result_chain->quick_grow (length);
5466 memcpy (result_chain->address (), dr_chain.address (),
5467 length * sizeof (tree));
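  /* Only two group shapes are handled here: a power-of-two LENGTH when the
     vectorization factor is greater than 4, and LENGTH == 3 when the
     vectorization factor is greater than 2.  In all other cases, or when the
     target lacks one of the required permutations, we return false and the
     caller falls back to vect_permute_load_chain.  */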
5469 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5471 unsigned int j, log_length = exact_log2 (length);
5472 for (i = 0; i < nelt / 2; ++i)
5473 sel[i] = i * 2;
5474 for (i = 0; i < nelt / 2; ++i)
5475 sel[nelt / 2 + i] = i * 2 + 1;
5476 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5478 if (dump_enabled_p ())
5479 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5480 "shuffle of 2 fields structure is not \
5481 supported by target\n");
5482 return false;
5484 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5486 for (i = 0; i < nelt / 2; ++i)
5487 sel[i] = i * 2 + 1;
5488 for (i = 0; i < nelt / 2; ++i)
5489 sel[nelt / 2 + i] = i * 2;
5490 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5492 if (dump_enabled_p ())
5493 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5494 "shuffle of 2 fields structure is not \
5495 supported by target\n");
5496 return false;
5498 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5500 /* Generating permutation constant to shift all elements.
5501 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5502 for (i = 0; i < nelt; i++)
5503 sel[i] = nelt / 2 + i;
5504 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5506 if (dump_enabled_p ())
5507 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5508 "shift permutation is not supported by target\n");
5509 return false;
5511 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5513      /* Generating permutation constant to select a vector from two vectors.
5514 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5515 for (i = 0; i < nelt / 2; i++)
5516 sel[i] = i;
5517 for (i = nelt / 2; i < nelt; i++)
5518 sel[i] = nelt + i;
5519 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5521 if (dump_enabled_p ())
5522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5523 "select is not supported by target\n");
5524 return false;
5526 select_mask = vect_gen_perm_mask_checked (vectype, sel);
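      /* Taken together, the statements generated in the loop below reproduce
	 the even/odd split of vect_permute_load_chain (sketch, assuming
	 NELT == 8):

	   vect[0] = VEC_PERM <first,  first,  perm2_mask1>  // evens | odds of FIRST
	   vect[1] = VEC_PERM <second, second, perm2_mask2>  // odds | evens of SECOND

	 shift1_mask then picks the upper half of vect[0] and the lower half
	 of vect[1], i.e. all odd elements, stored at RESULT_CHAIN[j/2 + length/2],
	 while select_mask picks the lower half of vect[0] and the upper half
	 of vect[1], i.e. all even elements, stored at RESULT_CHAIN[j/2].  */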
5528 for (i = 0; i < log_length; i++)
5530 for (j = 0; j < length; j += 2)
5532 first_vect = dr_chain[j];
5533 second_vect = dr_chain[j + 1];
5535 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5536 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5537 first_vect, first_vect,
5538 perm2_mask1);
5539 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5540 vect[0] = data_ref;
5542 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5543 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5544 second_vect, second_vect,
5545 perm2_mask2);
5546 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5547 vect[1] = data_ref;
5549 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5550 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5551 vect[0], vect[1], shift1_mask);
5552 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5553 (*result_chain)[j/2 + length/2] = data_ref;
5555 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5556 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5557 vect[0], vect[1], select_mask);
5558 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5559 (*result_chain)[j/2] = data_ref;
5561 memcpy (dr_chain.address (), result_chain->address (),
5562 length * sizeof (tree));
5564 return true;
5566 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5568 unsigned int k = 0, l = 0;
5570      /* Generating permutation constant to get all elements in the right order.
5571 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5572 for (i = 0; i < nelt; i++)
5574 if (3 * k + (l % 3) >= nelt)
5576 k = 0;
5577 l += (3 - (nelt % 3));
5579 sel[i] = 3 * k + (l % 3);
5580 k++;
5582 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5584 if (dump_enabled_p ())
5585 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5586 "shuffle of 3 fields structure is not \
5587 supported by target\n");
5588 return false;
5590 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5592 /* Generating permutation constant to shift all elements.
5593 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5594 for (i = 0; i < nelt; i++)
5595 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5596 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5598 if (dump_enabled_p ())
5599 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5600 "shift permutation is not supported by target\n");
5601 return false;
5603 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5605 /* Generating permutation constant to shift all elements.
5606 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5607 for (i = 0; i < nelt; i++)
5608 sel[i] = 2 * (nelt / 3) + 1 + i;
5609 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5611 if (dump_enabled_p ())
5612 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5613 "shift permutation is not supported by target\n");
5614 return false;
5616 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5618 /* Generating permutation constant to shift all elements.
5619 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5620 for (i = 0; i < nelt; i++)
5621 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5622 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5624 if (dump_enabled_p ())
5625 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5626 "shift permutation is not supported by target\n");
5627 return false;
5629 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5631 /* Generating permutation constant to shift all elements.
5632 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5633 for (i = 0; i < nelt; i++)
5634 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5635 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5637 if (dump_enabled_p ())
5638 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5639 "shift permutation is not supported by target\n");
5640 return false;
5642 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5644 for (k = 0; k < 3; k++)
5646 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5647 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5648 dr_chain[k], dr_chain[k],
5649 perm3_mask);
5650 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5651 vect[k] = data_ref;
5654 for (k = 0; k < 3; k++)
5656 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5657 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5658 vect[k % 3], vect[(k + 1) % 3],
5659 shift1_mask);
5660 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5661 vect_shift[k] = data_ref;
5664 for (k = 0; k < 3; k++)
5666 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5667 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5668 vect_shift[(4 - k) % 3],
5669 vect_shift[(3 - k) % 3],
5670 shift2_mask);
5671 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5672 vect[k] = data_ref;
5675 (*result_chain)[3 - (nelt % 3)] = vect[2];
5677 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5678 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5679 vect[0], shift3_mask);
5680 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5681 (*result_chain)[nelt % 3] = data_ref;
5683 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5684 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5685 vect[1], shift4_mask);
5686 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5687 (*result_chain)[0] = data_ref;
5688 return true;
5690 return false;
5693 /* Function vect_transform_grouped_load.
5695 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5696    to perform their permutation and assign the resulting vectorized statements to
5697    the scalar statements.
5700 void
5701 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5702 gimple_stmt_iterator *gsi)
5704 machine_mode mode;
5705 vec<tree> result_chain = vNULL;
5707 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5708      RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
5709      vectors that are ready for vector computation.  */
5710 result_chain.create (size);
5712    /* If the reassociation width for the vector type is 2 or greater, the target
5713       machine can execute 2 or more vector instructions in parallel.  Otherwise
5714       try to get the chain for the load group using vect_shift_permute_load_chain.  */
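  /* For example, with SIZE == 3 on a target whose reassociation width for
     VEC_PERM_EXPR is not greater than 1, vect_shift_permute_load_chain is
     tried first and vect_permute_load_chain is only the fallback; for a
     power-of-two SIZE, or a reassociation width greater than 1, the even/odd
     based vect_permute_load_chain is used directly.  */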
5715 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5716 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5717 || exact_log2 (size) != -1
5718 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5719 gsi, &result_chain))
5720 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5721 vect_record_grouped_load_vectors (stmt, result_chain);
5722 result_chain.release ();
5725 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5726 generated as part of the vectorization of STMT. Assign the statement
5727 for each vector to the associated scalar statement. */
5729 void
5730 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5732 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5733 gimple *next_stmt, *new_stmt;
5734 unsigned int i, gap_count;
5735 tree tmp_data_ref;
5737 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5738      Since we scan the chain starting from its first node, their order
5739      corresponds to the order of data-refs in RESULT_CHAIN.  */
5740 next_stmt = first_stmt;
5741 gap_count = 1;
5742 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5744 if (!next_stmt)
5745 break;
5747      /* Skip the gaps.  Loads created for the gaps will be removed by the dead
5748 code elimination pass later. No need to check for the first stmt in
5749 the group, since it always exists.
5750 GROUP_GAP is the number of steps in elements from the previous
5751 access (if there is no gap GROUP_GAP is 1). We skip loads that
5752 correspond to the gaps. */
5753 if (next_stmt != first_stmt
5754 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5756 gap_count++;
5757 continue;
5760 while (next_stmt)
5762 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5763 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5764 copies, and we put the new vector statement in the first available
5765 RELATED_STMT. */
5766 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5767 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5768 else
5770 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5772 gimple *prev_stmt =
5773 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5774 gimple *rel_stmt =
5775 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5776 while (rel_stmt)
5778 prev_stmt = rel_stmt;
5779 rel_stmt =
5780 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5783 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5784 new_stmt;
5788 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5789 gap_count = 1;
5790 /* If NEXT_STMT accesses the same DR as the previous statement,
5791 put the same TMP_DATA_REF as its vectorized statement; otherwise
5792 get the next data-ref from RESULT_CHAIN. */
5793 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5794 break;
5799 /* Function vect_can_force_dr_alignment_p.
5801    Returns whether the alignment of a DECL can be forced to be aligned
5802    on an ALIGNMENT-bit boundary.  */
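/* For example, the alignment of a function-local VAR_DECL can only be raised
   up to MAX_STACK_ALIGNMENT bits, a TREE_STATIC one only up to
   MAX_OFILE_ALIGNMENT, and a declaration visible in the symbol table must in
   addition allow its alignment to be increased.  */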
5804 bool
5805 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5807 if (TREE_CODE (decl) != VAR_DECL)
5808 return false;
5810 if (decl_in_symtab_p (decl)
5811 && !symtab_node::get (decl)->can_increase_alignment_p ())
5812 return false;
5814 if (TREE_STATIC (decl))
5815 return (alignment <= MAX_OFILE_ALIGNMENT);
5816 else
5817 return (alignment <= MAX_STACK_ALIGNMENT);
5821 /* Return whether the data reference DR is supported with respect to its
5822 alignment.
5823    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5824    if it is aligned, i.e., check if it is possible to vectorize it with different
5825    alignment.  */
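/* The answer, as returned below, is one of dr_aligned,
   dr_unaligned_supported, dr_explicit_realign, dr_explicit_realign_optimized
   or dr_unaligned_unsupported.  */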
5827 enum dr_alignment_support
5828 vect_supportable_dr_alignment (struct data_reference *dr,
5829 bool check_aligned_accesses)
5831 gimple *stmt = DR_STMT (dr);
5832 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5833 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5834 machine_mode mode = TYPE_MODE (vectype);
5835 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5836 struct loop *vect_loop = NULL;
5837 bool nested_in_vect_loop = false;
5839 if (aligned_access_p (dr) && !check_aligned_accesses)
5840 return dr_aligned;
5842 /* For now assume all conditional loads/stores support unaligned
5843 access without any special code. */
5844 if (is_gimple_call (stmt)
5845 && gimple_call_internal_p (stmt)
5846 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5847 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5848 return dr_unaligned_supported;
5850 if (loop_vinfo)
5852 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5853 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5856 /* Possibly unaligned access. */
5858 /* We can choose between using the implicit realignment scheme (generating
5859 a misaligned_move stmt) and the explicit realignment scheme (generating
5860     aligned loads with a REALIGN_LOAD).  There are two variants of the
5861     explicit realignment scheme: optimized and unoptimized.
5862 We can optimize the realignment only if the step between consecutive
5863 vector loads is equal to the vector size. Since the vector memory
5864 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5865 is guaranteed that the misalignment amount remains the same throughout the
5866 execution of the vectorized loop. Therefore, we can create the
5867 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5868 at the loop preheader.
5870 However, in the case of outer-loop vectorization, when vectorizing a
5871 memory access in the inner-loop nested within the LOOP that is now being
5872 vectorized, while it is guaranteed that the misalignment of the
5873 vectorized memory access will remain the same in different outer-loop
5874     iterations, it is *not* guaranteed that it will remain the same throughout
5875 the execution of the inner-loop. This is because the inner-loop advances
5876 with the original scalar step (and not in steps of VS). If the inner-loop
5877 step happens to be a multiple of VS, then the misalignment remains fixed
5878 and we can use the optimized realignment scheme. For example:
5880 for (i=0; i<N; i++)
5881 for (j=0; j<M; j++)
5882 s += a[i+j];
5884 When vectorizing the i-loop in the above example, the step between
5885 consecutive vector loads is 1, and so the misalignment does not remain
5886 fixed across the execution of the inner-loop, and the realignment cannot
5887 be optimized (as illustrated in the following pseudo vectorized loop):
5889 for (i=0; i<N; i+=4)
5890 for (j=0; j<M; j++){
5891 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5892 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5893 // (assuming that we start from an aligned address).
5896 We therefore have to use the unoptimized realignment scheme:
5898 for (i=0; i<N; i+=4)
5899 for (j=k; j<M; j+=4)
5900 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5901 // that the misalignment of the initial address is
5902 // 0).
5904 The loop can then be vectorized as follows:
5906 for (k=0; k<4; k++){
5907 rt = get_realignment_token (&vp[k]);
5908 for (i=0; i<N; i+=4){
5909 v1 = vp[i+k];
5910 for (j=k; j<M; j+=4){
5911 v2 = vp[i+j+VS-1];
5912 va = REALIGN_LOAD <v1,v2,rt>;
5913 vs += va;
5914 v1 = v2;
5917 } */
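  /* Accordingly, for reads the code below prefers dr_explicit_realign_optimized
     or dr_explicit_realign when the target implements vec_realign_load (and its
     mask_for_load builtin, if it defines one); otherwise, and for writes, the
     answer is dr_unaligned_supported or dr_unaligned_unsupported depending on
     the type's user alignment and targetm.vectorize.support_vector_misalignment.  */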
5919 if (DR_IS_READ (dr))
5921 bool is_packed = false;
5922 tree type = (TREE_TYPE (DR_REF (dr)));
5924 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5925 && (!targetm.vectorize.builtin_mask_for_load
5926 || targetm.vectorize.builtin_mask_for_load ()))
5928 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5929 if ((nested_in_vect_loop
5930 && (TREE_INT_CST_LOW (DR_STEP (dr))
5931 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5932 || !loop_vinfo)
5933 return dr_explicit_realign;
5934 else
5935 return dr_explicit_realign_optimized;
5937 if (!known_alignment_for_access_p (dr))
5938 is_packed = not_size_aligned (DR_REF (dr));
5940 if ((TYPE_USER_ALIGN (type) && !is_packed)
5941 || targetm.vectorize.
5942 support_vector_misalignment (mode, type,
5943 DR_MISALIGNMENT (dr), is_packed))
5944 /* Can't software pipeline the loads, but can at least do them. */
5945 return dr_unaligned_supported;
5947 else
5949 bool is_packed = false;
5950 tree type = (TREE_TYPE (DR_REF (dr)));
5952 if (!known_alignment_for_access_p (dr))
5953 is_packed = not_size_aligned (DR_REF (dr));
5955 if ((TYPE_USER_ALIGN (type) && !is_packed)
5956 || targetm.vectorize.
5957 support_vector_misalignment (mode, type,
5958 DR_MISALIGNMENT (dr), is_packed))
5959 return dr_unaligned_supported;
5962 /* Unsupported. */
5963 return dr_unaligned_unsupported;