gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "tm_p.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "cgraph.h"
35 #include "dumpfile.h"
36 #include "alias.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "tree-eh.h"
40 #include "gimplify.h"
41 #include "gimple-iterator.h"
42 #include "gimplify-me.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "tree-ssa-loop-manip.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "expr.h"
50 #include "builtins.h"
51 #include "params.h"
53 /* Return true if load- or store-lanes optab OPTAB is implemented for
54 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
56 static bool
57 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
58 tree vectype, unsigned HOST_WIDE_INT count)
60 machine_mode mode, array_mode;
61 bool limit_p;
63 mode = TYPE_MODE (vectype);
64 limit_p = !targetm.array_mode_supported_p (mode, count);
65 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
66 MODE_INT, limit_p);
68 if (array_mode == BLKmode)
70 if (dump_enabled_p ())
71 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
72 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
73 GET_MODE_NAME (mode), count);
74 return false;
77 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
79 if (dump_enabled_p ())
80 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
81 "cannot use %s<%s><%s>\n", name,
82 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
83 return false;
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_NOTE, vect_location,
88 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
89 GET_MODE_NAME (mode));
91 return true;
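
A minimal illustrative sketch of the size arithmetic above (plain C, not GCC internals; the helper name is invented for the example): COUNT vectors of a given mode need an integer "array mode" of COUNT times the vector's bit size, and the lanes optab can only be used if such a mode exists.

#include <stdbool.h>

/* Sketch only: bit counts stand in for machine modes.  A missing mode
   corresponds to mode_for_size returning BLKmode above.  */
static bool
lanes_array_mode_exists_p (unsigned vec_bits, unsigned count,
                           unsigned widest_int_mode_bits)
{
  unsigned array_bits = count * vec_bits;  /* count * GET_MODE_BITSIZE (mode) */
  return array_bits <= widest_int_mode_bits;
}
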
95 /* Return the smallest scalar part of STMT.
96 This is used to determine the vectype of the stmt. We generally set the
97 vectype according to the type of the result (lhs). For stmts whose
98 result-type is different than the type of the arguments (e.g., demotion,
99 promotion), vectype will be reset appropriately (later). Note that we have
100 to visit the smallest datatype in this function, because that determines the
101 VF. If the smallest datatype in the loop is present only as the rhs of a
102 promotion operation - we'd miss it.
103 Such a case, where a variable of this datatype does not appear in the lhs
104 anywhere in the loop, can only occur if it's an invariant: e.g.:
105 'int_x = (int) short_inv', which we'd expect to have been optimized away by
106 invariant motion. However, we cannot rely on invariant motion to always
107 take invariants out of the loop, and so in the case of promotion we also
108 have to check the rhs.
109 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
110 types. */
112 tree
113 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
114 HOST_WIDE_INT *rhs_size_unit)
116 tree scalar_type = gimple_expr_type (stmt);
117 HOST_WIDE_INT lhs, rhs;
119 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
121 if (is_gimple_assign (stmt)
122 && (gimple_assign_cast_p (stmt)
123 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
124 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
125 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
127 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
129 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
130 if (rhs < lhs)
131 scalar_type = rhs_type;
134 *lhs_size_unit = lhs;
135 *rhs_size_unit = rhs;
136 return scalar_type;
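
A minimal sketch of the decision above in plain C (names invented for the example): for a widening statement such as int_x = (int) short_y, the 2-byte right-hand side rather than the 4-byte result determines the vectorization factor, so the smaller type is returned.

#include <stddef.h>

/* Sketch: pick the scalar size that should drive the VF for a widening
   operation; mirrors the rhs < lhs comparison above.  */
static size_t
smallest_scalar_size (size_t lhs_size, size_t rhs_size, int is_widening_op)
{
  if (is_widening_op && rhs_size < lhs_size)
    return rhs_size;   /* e.g. 2 (short) for int_x = (int) short_y */
  return lhs_size;
}
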
140 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
141 tested at run-time. Return TRUE if DDR was successfully inserted.
142 Return false if versioning is not supported. */
144 static bool
145 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
147 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
149 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
150 return false;
152 if (dump_enabled_p ())
154 dump_printf_loc (MSG_NOTE, vect_location,
155 "mark for run-time aliasing test between ");
156 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
157 dump_printf (MSG_NOTE, " and ");
158 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
159 dump_printf (MSG_NOTE, "\n");
162 if (optimize_loop_nest_for_size_p (loop))
164 if (dump_enabled_p ())
165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
166 "versioning not supported when optimizing"
167 " for size.\n");
168 return false;
171 /* FORNOW: We don't support versioning with outer-loop vectorization. */
172 if (loop->inner)
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
176 "versioning not yet supported for outer-loops.\n");
177 return false;
180 /* FORNOW: We don't support creating runtime alias tests for non-constant
181 step. */
182 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
183 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
185 if (dump_enabled_p ())
186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
187 "versioning not yet supported for non-constant "
188 "step\n");
189 return false;
192 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
193 return true;
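
The DDRs recorded here are later turned into run-time overlap checks guarding a vectorized copy of the loop. The following is only a conceptual sketch of such a guard, in plain C with invented names; the real check is generated elsewhere by the vectorizer and also accounts for access size and step direction.

#include <stdint.h>
#include <stddef.h>

/* Conceptual versioning-for-alias guard: the vector loop may run only
   if segment A ends before segment B starts, or vice versa.  */
static int
segments_do_not_overlap (uintptr_t addr_a, size_t seglen_a,
                         uintptr_t addr_b, size_t seglen_b)
{
  return addr_a + seglen_a <= addr_b || addr_b + seglen_b <= addr_a;
}
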
197 /* Function vect_analyze_data_ref_dependence.
199 Return TRUE if there (might) exist a dependence between a memory-reference
200 DRA and a memory-reference DRB. If the dependence can instead be
201 checked at run time by versioning for alias, return FALSE. Adjust *MAX_VF
202 according to the data dependence. */
204 static bool
205 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
206 loop_vec_info loop_vinfo, int *max_vf)
208 unsigned int i;
209 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
210 struct data_reference *dra = DDR_A (ddr);
211 struct data_reference *drb = DDR_B (ddr);
212 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
213 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
214 lambda_vector dist_v;
215 unsigned int loop_depth;
217 /* In loop analysis all data references should be vectorizable. */
218 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
219 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
220 gcc_unreachable ();
222 /* Independent data accesses. */
223 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
224 return false;
226 if (dra == drb
227 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
228 return false;
230 /* Even if we have an anti-dependence then, as the vectorized loop covers at
231 least two scalar iterations, there is always also a true dependence.
232 As the vectorizer does not re-order loads and stores we can ignore
233 the anti-dependence if TBAA can disambiguate both DRs similar to the
234 case with known negative distance anti-dependences (positive
235 distance anti-dependences would violate TBAA constraints). */
236 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
237 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
238 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
239 get_alias_set (DR_REF (drb))))
240 return false;
242 /* Unknown data dependence. */
243 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
245 /* If user asserted safelen consecutive iterations can be
246 executed concurrently, assume independence. */
247 if (loop->safelen >= 2)
249 if (loop->safelen < *max_vf)
250 *max_vf = loop->safelen;
251 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
252 return false;
255 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
256 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
258 if (dump_enabled_p ())
260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
261 "versioning for alias not supported for: "
262 "can't determine dependence between ");
263 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
264 DR_REF (dra));
265 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
266 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
267 DR_REF (drb));
268 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
270 return true;
273 if (dump_enabled_p ())
275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
276 "versioning for alias required: "
277 "can't determine dependence between ");
278 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
279 DR_REF (dra));
280 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
281 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
282 DR_REF (drb));
283 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
286 /* Add to list of ddrs that need to be tested at run-time. */
287 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
290 /* Known data dependence. */
291 if (DDR_NUM_DIST_VECTS (ddr) == 0)
293 /* If user asserted safelen consecutive iterations can be
294 executed concurrently, assume independence. */
295 if (loop->safelen >= 2)
297 if (loop->safelen < *max_vf)
298 *max_vf = loop->safelen;
299 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
300 return false;
303 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
304 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
306 if (dump_enabled_p ())
308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
309 "versioning for alias not supported for: "
310 "bad dist vector for ");
311 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
312 DR_REF (dra));
313 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
314 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
315 DR_REF (drb));
316 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
318 return true;
321 if (dump_enabled_p ())
323 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
324 "versioning for alias required: "
325 "bad dist vector for ");
326 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
327 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
328 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
329 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
331 /* Add to list of ddrs that need to be tested at run-time. */
332 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
335 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
336 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
338 int dist = dist_v[loop_depth];
340 if (dump_enabled_p ())
341 dump_printf_loc (MSG_NOTE, vect_location,
342 "dependence distance = %d.\n", dist);
344 if (dist == 0)
346 if (dump_enabled_p ())
348 dump_printf_loc (MSG_NOTE, vect_location,
349 "dependence distance == 0 between ");
350 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
351 dump_printf (MSG_NOTE, " and ");
352 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
353 dump_printf (MSG_NOTE, "\n");
356 /* When we perform grouped accesses and perform implicit CSE
357 by detecting equal accesses and doing disambiguation with
358 runtime alias tests like for
359 .. = a[i];
360 .. = a[i+1];
361 a[i] = ..;
362 a[i+1] = ..;
363 *p = ..;
364 .. = a[i];
365 .. = a[i+1];
366 where we will end up loading { a[i], a[i+1] } once, make
367 sure that inserting group loads before the first load and
368 stores after the last store will do the right thing.
369 Similar for groups like
370 a[i] = ...;
371 ... = a[i];
372 a[i+1] = ...;
373 where loads from the group interleave with the store. */
374 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
375 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
377 gimple *earlier_stmt;
378 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
379 if (DR_IS_WRITE
380 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
382 if (dump_enabled_p ())
383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
384 "READ_WRITE dependence in interleaving."
385 "\n");
386 return true;
390 continue;
393 if (dist > 0 && DDR_REVERSED_P (ddr))
395 /* If DDR_REVERSED_P the order of the data-refs in DDR was
396 reversed (to make distance vector positive), and the actual
397 distance is negative. */
398 if (dump_enabled_p ())
399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
400 "dependence distance negative.\n");
401 /* Record a negative dependence distance to later limit the
402 amount of stmt copying / unrolling we can perform.
403 Only need to handle read-after-write dependence. */
404 if (DR_IS_READ (drb)
405 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
406 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
407 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
408 continue;
411 if (abs (dist) >= 2
412 && abs (dist) < *max_vf)
414 /* The dependence distance requires reduction of the maximal
415 vectorization factor. */
416 *max_vf = abs (dist);
417 if (dump_enabled_p ())
418 dump_printf_loc (MSG_NOTE, vect_location,
419 "adjusting maximal vectorization factor to %i\n",
420 *max_vf);
423 if (abs (dist) >= *max_vf)
425 /* Dependence distance does not create dependence, as far as
426 vectorization is concerned, in this case. */
427 if (dump_enabled_p ())
428 dump_printf_loc (MSG_NOTE, vect_location,
429 "dependence distance >= VF.\n");
430 continue;
433 if (dump_enabled_p ())
435 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
436 "not vectorized, possible dependence "
437 "between data-refs ");
438 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
439 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
440 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
441 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
444 return true;
447 return false;
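
A small worked sketch of the distance handling above (plain C, simplified, invented helper name): a distance of 3, as in a[i] = a[i-3] + 1, caps the usable vectorization factor at 3, a distance of at least the current *max_vf imposes no constraint, and a distance of 1 blocks vectorization.

#include <stdlib.h>

/* Simplified model of the per-distance max_vf adjustment; returns the
   new maximal VF, or 0 when the dependence forbids vectorization.
   The reversed-DDR case above is omitted here.  */
static int
adjust_max_vf (int dist, int max_vf)
{
  int d = abs (dist);
  if (d == 0)
    return max_vf;     /* same-iteration accesses: handled separately above */
  if (d >= 2 && d < max_vf)
    return d;          /* e.g. dist 3, max_vf 8 -> new max_vf 3 */
  if (d >= max_vf)
    return max_vf;     /* distance >= VF: no constraint */
  return 0;            /* |dist| == 1: possible dependence, give up */
}
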
450 /* Function vect_analyze_data_ref_dependences.
452 Examine all the data references in the loop, and make sure there do not
453 exist any data dependences between them. Set *MAX_VF according to
454 the maximum vectorization factor the data dependences allow. */
456 bool
457 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
459 unsigned int i;
460 struct data_dependence_relation *ddr;
462 if (dump_enabled_p ())
463 dump_printf_loc (MSG_NOTE, vect_location,
464 "=== vect_analyze_data_ref_dependences ===\n");
466 LOOP_VINFO_DDRS (loop_vinfo)
467 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
468 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
469 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
470 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
471 &LOOP_VINFO_DDRS (loop_vinfo),
472 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
473 return false;
475 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
476 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
477 return false;
479 return true;
483 /* Function vect_slp_analyze_data_ref_dependence.
485 Return TRUE if there (might) exist a dependence between a memory-reference
486 DRA and a memory-reference DRB. Unlike the loop analysis above, no
487 run-time alias versioning is attempted here, so TRUE is returned for
488 any possible dependence. */
490 static bool
491 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
493 struct data_reference *dra = DDR_A (ddr);
494 struct data_reference *drb = DDR_B (ddr);
496 /* We need to check dependences of statements marked as unvectorizable
497 as well, they still can prohibit vectorization. */
499 /* Independent data accesses. */
500 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
501 return false;
503 if (dra == drb)
504 return false;
506 /* Read-read is OK. */
507 if (DR_IS_READ (dra) && DR_IS_READ (drb))
508 return false;
510 /* If dra and drb are part of the same interleaving chain consider
511 them independent. */
512 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
513 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
514 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
515 return false;
517 /* Unknown data dependence. */
518 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
520 if (dump_enabled_p ())
522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
523 "can't determine dependence between ");
524 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
525 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
526 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
527 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
530 else if (dump_enabled_p ())
532 dump_printf_loc (MSG_NOTE, vect_location,
533 "determined dependence between ");
534 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
535 dump_printf (MSG_NOTE, " and ");
536 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
537 dump_printf (MSG_NOTE, "\n");
540 return true;
544 /* Analyze dependences involved in the transform of SLP NODE. STORES
545 contain the vector of scalar stores of this instance if we are
546 disambiguating the loads. */
548 static bool
549 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
550 vec<gimple *> stores, gimple *last_store)
552 /* This walks over all stmts involved in the SLP load/store done
553 in NODE verifying we can sink them up to the last stmt in the
554 group. */
555 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
556 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
558 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
559 if (access == last_access)
560 continue;
561 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
562 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
563 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
565 gimple *stmt = gsi_stmt (gsi);
566 if (! gimple_vuse (stmt)
567 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
568 continue;
570 /* If we couldn't record a (single) data reference for this
571 stmt we have to give up. */
572 /* ??? Here and below if dependence analysis fails we can resort
573 to the alias oracle which can handle more kinds of stmts. */
574 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
575 if (!dr_b)
576 return false;
578 /* If we run into a store of this same instance (we've just
579 marked those) then delay dependence checking until we run
580 into the last store because this is where it will have
581 been sunk to (and we verify if we can do that as well). */
582 if (gimple_visited_p (stmt))
584 if (stmt != last_store)
585 continue;
586 unsigned i;
587 gimple *store;
588 FOR_EACH_VEC_ELT (stores, i, store)
590 data_reference *store_dr
591 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
592 ddr_p ddr = initialize_data_dependence_relation
593 (dr_a, store_dr, vNULL);
594 if (vect_slp_analyze_data_ref_dependence (ddr))
596 free_dependence_relation (ddr);
597 return false;
599 free_dependence_relation (ddr);
603 ddr_p ddr = initialize_data_dependence_relation (dr_a, dr_b, vNULL);
604 if (vect_slp_analyze_data_ref_dependence (ddr))
606 free_dependence_relation (ddr);
607 return false;
609 free_dependence_relation (ddr);
612 return true;
616 /* Function vect_slp_analyze_instance_dependence.
618 Examine all the data references in the SLP instance, and make sure there
619 do not exist any data dependences between them that would prevent the
620 instance from being vectorized. */
622 bool
623 vect_slp_analyze_instance_dependence (slp_instance instance)
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE, vect_location,
627 "=== vect_slp_analyze_instance_dependence ===\n");
629 /* The stores of this instance are at the root of the SLP tree. */
630 slp_tree store = SLP_INSTANCE_TREE (instance);
631 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
632 store = NULL;
634 /* Verify we can sink stores to the vectorized stmt insert location. */
635 gimple *last_store = NULL;
636 if (store)
638 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
639 return false;
641 /* Mark stores in this instance and remember the last one. */
642 last_store = vect_find_last_scalar_stmt_in_slp (store);
643 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
644 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
647 bool res = true;
649 /* Verify we can sink loads to the vectorized stmt insert location,
650 special-casing stores of this instance. */
651 slp_tree load;
652 unsigned int i;
653 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
654 if (! vect_slp_analyze_node_dependences (instance, load,
655 store
656 ? SLP_TREE_SCALAR_STMTS (store)
657 : vNULL, last_store))
659 res = false;
660 break;
663 /* Unset the visited flag. */
664 if (store)
665 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
666 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
668 return res;
671 /* Function vect_compute_data_ref_alignment
673 Compute the misalignment of the data reference DR.
675 Output:
676 1. If during the misalignment computation it is found that the data reference
677 cannot be vectorized then false is returned.
678 2. DR_MISALIGNMENT (DR) is defined.
680 FOR NOW: No analysis is actually performed. Misalignment is calculated
681 only for trivial cases. TODO. */
683 bool
684 vect_compute_data_ref_alignment (struct data_reference *dr)
686 gimple *stmt = DR_STMT (dr);
687 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
688 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
689 struct loop *loop = NULL;
690 tree ref = DR_REF (dr);
691 tree vectype;
692 tree base, base_addr;
693 tree misalign = NULL_TREE;
694 tree aligned_to;
695 unsigned HOST_WIDE_INT alignment;
697 if (dump_enabled_p ())
698 dump_printf_loc (MSG_NOTE, vect_location,
699 "vect_compute_data_ref_alignment:\n");
701 if (loop_vinfo)
702 loop = LOOP_VINFO_LOOP (loop_vinfo);
704 /* Initialize misalignment to unknown. */
705 SET_DR_MISALIGNMENT (dr, -1);
707 if (tree_fits_shwi_p (DR_STEP (dr)))
708 misalign = DR_INIT (dr);
709 aligned_to = DR_ALIGNED_TO (dr);
710 base_addr = DR_BASE_ADDRESS (dr);
711 vectype = STMT_VINFO_VECTYPE (stmt_info);
713 /* In case the dataref is in an inner-loop of the loop that is being
714 vectorized (LOOP), we use the base and misalignment information
715 relative to the outer-loop (LOOP). This is ok only if the misalignment
716 stays the same throughout the execution of the inner-loop, which is why
717 we have to check that the stride of the dataref in the inner-loop is
718 evenly divisible by the vector size. */
719 if (loop && nested_in_vect_loop_p (loop, stmt))
721 tree step = DR_STEP (dr);
723 if (tree_fits_shwi_p (step)
724 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
726 if (dump_enabled_p ())
727 dump_printf_loc (MSG_NOTE, vect_location,
728 "inner step divides the vector-size.\n");
729 misalign = STMT_VINFO_DR_INIT (stmt_info);
730 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
731 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
733 else
735 if (dump_enabled_p ())
736 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
737 "inner step doesn't divide the vector-size.\n");
738 misalign = NULL_TREE;
742 /* Similarly we can only use base and misalignment information relative to
743 an innermost loop if the misalignment stays the same throughout the
744 execution of the loop. As above, this is the case if the stride of
745 the dataref is evenly divisible by the vector size. */
746 else
748 tree step = DR_STEP (dr);
749 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
751 if (tree_fits_shwi_p (step)
752 && ((tree_to_shwi (step) * vf)
753 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
755 if (dump_enabled_p ())
756 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
757 "step doesn't divide the vector-size.\n");
758 misalign = NULL_TREE;
762 /* To look at alignment of the base we have to preserve an inner MEM_REF
763 as that carries alignment information of the actual access. */
764 base = ref;
765 while (handled_component_p (base))
766 base = TREE_OPERAND (base, 0);
767 if (TREE_CODE (base) == MEM_REF)
768 base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
769 build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
770 unsigned int base_alignment = get_object_alignment (base);
772 if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
773 DR_VECT_AUX (dr)->base_element_aligned = true;
775 alignment = TYPE_ALIGN_UNIT (vectype);
777 if ((compare_tree_int (aligned_to, alignment) < 0)
778 || !misalign)
780 if (dump_enabled_p ())
782 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
783 "Unknown alignment for access: ");
784 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
785 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
787 return true;
790 if (base_alignment < TYPE_ALIGN (vectype))
792 /* Strip an inner MEM_REF to a bare decl if possible. */
793 if (TREE_CODE (base) == MEM_REF
794 && integer_zerop (TREE_OPERAND (base, 1))
795 && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
796 base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
798 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
800 if (dump_enabled_p ())
802 dump_printf_loc (MSG_NOTE, vect_location,
803 "can't force alignment of ref: ");
804 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
805 dump_printf (MSG_NOTE, "\n");
807 return true;
810 /* Force the alignment of the decl.
811 NOTE: This is the only change to the code we make during
812 the analysis phase, before deciding to vectorize the loop. */
813 if (dump_enabled_p ())
815 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
816 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
817 dump_printf (MSG_NOTE, "\n");
820 DR_VECT_AUX (dr)->base_decl = base;
821 DR_VECT_AUX (dr)->base_misaligned = true;
822 DR_VECT_AUX (dr)->base_element_aligned = true;
825 /* If this is a backward running DR then first access in the larger
826 vectype actually is N-1 elements before the address in the DR.
827 Adjust misalign accordingly. */
828 if (tree_int_cst_sgn (DR_STEP (dr)) < 0)
830 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
831 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
832 otherwise we wouldn't be here. */
833 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
834 /* PLUS because DR_STEP was negative. */
835 misalign = size_binop (PLUS_EXPR, misalign, offset);
838 SET_DR_MISALIGNMENT (dr,
839 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
841 if (dump_enabled_p ())
843 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
844 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
845 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
846 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
849 return true;
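
To make the final computation above concrete, here is a hedged sketch in plain C (invented helper, byte arithmetic instead of trees): with a 16-byte vector alignment, a sufficiently aligned base and DR_INIT of 4 give misalignment 4; for a backward-running access, (nunits - 1) * step (step negative) is added to the offset before the floor modulo.

/* Sketch of SET_DR_MISALIGNMENT's operand: OFFSET is DR_INIT in bytes,
   STEP the signed element step, NUNITS the vector lane count and
   VEC_ALIGN the vector alignment in bytes.  */
static int
compute_misalignment (long offset, long step, int nunits, int vec_align)
{
  if (step < 0)
    offset += (long) (nunits - 1) * step;  /* backward DR adjustment */

  long m = offset % vec_align;             /* floor modulo, as wi::mod_floor */
  if (m < 0)
    m += vec_align;
  return (int) m;
}
/* Examples: (4, 4, 4, 16) -> 4; (0, -4, 4, 16) -> (0 - 12) mod 16 = 4.  */
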
853 /* Function vect_update_misalignment_for_peel
855 DR - the data reference whose misalignment is to be adjusted.
856 DR_PEEL - the data reference whose misalignment is being made
857 zero in the vector loop by the peel.
858 NPEEL - the number of iterations in the peel loop if the misalignment
859 of DR_PEEL is known at compile time. */
861 static void
862 vect_update_misalignment_for_peel (struct data_reference *dr,
863 struct data_reference *dr_peel, int npeel)
865 unsigned int i;
866 vec<dr_p> same_align_drs;
867 struct data_reference *current_dr;
868 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
869 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
870 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
871 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
873 /* For interleaved data accesses the step in the loop must be multiplied by
874 the size of the interleaving group. */
875 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
876 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
877 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
878 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
880 /* It can be assumed that the data refs with the same alignment as dr_peel
881 are aligned in the vector loop. */
882 same_align_drs
883 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
884 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
886 if (current_dr != dr)
887 continue;
888 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
889 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
890 SET_DR_MISALIGNMENT (dr, 0);
891 return;
894 if (known_alignment_for_access_p (dr)
895 && known_alignment_for_access_p (dr_peel))
897 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
898 int misal = DR_MISALIGNMENT (dr);
899 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
900 misal += negative ? -npeel * dr_size : npeel * dr_size;
901 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
902 SET_DR_MISALIGNMENT (dr, misal);
903 return;
906 if (dump_enabled_p ())
907 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
908 SET_DR_MISALIGNMENT (dr, -1);
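
A hedged worked sketch of the known-alignment branch above (plain C, invented helper): peeling npeel scalar iterations advances every remaining data-ref by npeel element-sized steps, so its recorded misalignment moves by npeel * dr_size modulo the vector alignment, backwards when the step is negative.

/* Sketch of the DR_MISALIGNMENT update when both DR and the peeled DR
   have known alignment; VEC_ALIGN_BYTES is TYPE_ALIGN (vectype) /
   BITS_PER_UNIT and must be a power of two.  */
static int
misalignment_after_peel (int misal, int npeel, int dr_size,
                         int negative_step, int vec_align_bytes)
{
  misal += negative_step ? -npeel * dr_size : npeel * dr_size;
  return misal & (vec_align_bytes - 1);
}
/* Example: misal 12, npeel 1, dr_size 4, forward, 16-byte vectors -> 0.  */
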
912 /* Function verify_data_ref_alignment
914 Return TRUE if DR can be handled with respect to alignment. */
916 static bool
917 verify_data_ref_alignment (data_reference_p dr)
919 enum dr_alignment_support supportable_dr_alignment
920 = vect_supportable_dr_alignment (dr, false);
921 if (!supportable_dr_alignment)
923 if (dump_enabled_p ())
925 if (DR_IS_READ (dr))
926 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
927 "not vectorized: unsupported unaligned load.");
928 else
929 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
930 "not vectorized: unsupported unaligned "
931 "store.");
933 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
934 DR_REF (dr));
935 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
937 return false;
940 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
941 dump_printf_loc (MSG_NOTE, vect_location,
942 "Vectorizing an unaligned access.\n");
944 return true;
947 /* Function vect_verify_datarefs_alignment
949 Return TRUE if all data references in the loop can be
950 handled with respect to alignment. */
952 bool
953 vect_verify_datarefs_alignment (loop_vec_info vinfo)
955 vec<data_reference_p> datarefs = vinfo->datarefs;
956 struct data_reference *dr;
957 unsigned int i;
959 FOR_EACH_VEC_ELT (datarefs, i, dr)
961 gimple *stmt = DR_STMT (dr);
962 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
964 if (!STMT_VINFO_RELEVANT_P (stmt_info))
965 continue;
967 /* For interleaving, only the alignment of the first access matters. */
968 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
969 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
970 continue;
972 /* Strided accesses perform only component accesses, alignment is
973 irrelevant for them. */
974 if (STMT_VINFO_STRIDED_P (stmt_info)
975 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
976 continue;
978 if (! verify_data_ref_alignment (dr))
979 return false;
982 return true;
985 /* Given a memory reference EXP, return whether its alignment is less
986 than its size. */
988 static bool
989 not_size_aligned (tree exp)
991 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
992 return true;
994 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
995 > get_object_alignment (exp));
998 /* Function vector_alignment_reachable_p
1000 Return true if vector alignment for DR is reachable by peeling
1001 a few loop iterations. Return false otherwise. */
1003 static bool
1004 vector_alignment_reachable_p (struct data_reference *dr)
1006 gimple *stmt = DR_STMT (dr);
1007 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1008 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1010 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1012 /* For interleaved access we peel only if the number of iterations in
1013 the prolog loop ({VF - misalignment}) is a multiple of the
1014 number of the interleaved accesses. */
1015 int elem_size, mis_in_elements;
1016 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1018 /* FORNOW: handle only known alignment. */
1019 if (!known_alignment_for_access_p (dr))
1020 return false;
1022 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1023 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1025 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1026 return false;
1029 /* If misalignment is known at compile time then allow peeling
1030 only if natural alignment is reachable through peeling. */
1031 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1033 HOST_WIDE_INT elmsize =
1034 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1035 if (dump_enabled_p ())
1037 dump_printf_loc (MSG_NOTE, vect_location,
1038 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1039 dump_printf (MSG_NOTE,
1040 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1042 if (DR_MISALIGNMENT (dr) % elmsize)
1044 if (dump_enabled_p ())
1045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1046 "data size does not divide the misalignment.\n");
1047 return false;
1051 if (!known_alignment_for_access_p (dr))
1053 tree type = TREE_TYPE (DR_REF (dr));
1054 bool is_packed = not_size_aligned (DR_REF (dr));
1055 if (dump_enabled_p ())
1056 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1057 "Unknown misalignment, is_packed = %d\n",is_packed);
1058 if ((TYPE_USER_ALIGN (type) && !is_packed)
1059 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1060 return true;
1061 else
1062 return false;
1065 return true;
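
As an illustration of the early checks above (plain C sketch with invented names, known-misalignment paths only): 4-byte elements with a 3-byte misalignment can never be aligned by peeling whole iterations, and for interleaved accesses the number of peeled elements must additionally be a multiple of the group size.

#include <stdbool.h>

/* Sketch of vector_alignment_reachable_p for a known misalignment,
   using byte counts instead of trees and modes.  GROUP_SIZE is 1 for
   non-grouped accesses.  */
static bool
alignment_reachable_p (int misalign, int elem_size, int nelements,
                       int group_size)
{
  if (misalign % elem_size)
    return false;                          /* e.g. misalign 3, elem_size 4 */
  int mis_in_elements = misalign / elem_size;
  if (group_size > 1
      && (nelements - mis_in_elements) % group_size)
    return false;                          /* prologue must cover whole groups */
  return true;
}
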
1069 /* Calculate the cost of the memory access represented by DR. */
1071 static void
1072 vect_get_data_access_cost (struct data_reference *dr,
1073 unsigned int *inside_cost,
1074 unsigned int *outside_cost,
1075 stmt_vector_for_cost *body_cost_vec)
1077 gimple *stmt = DR_STMT (dr);
1078 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1079 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1080 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1081 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1082 int ncopies = vf / nunits;
1084 if (DR_IS_READ (dr))
1085 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1086 NULL, body_cost_vec, false);
1087 else
1088 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1090 if (dump_enabled_p ())
1091 dump_printf_loc (MSG_NOTE, vect_location,
1092 "vect_get_data_access_cost: inside_cost = %d, "
1093 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1097 typedef struct _vect_peel_info
1099 int npeel;
1100 struct data_reference *dr;
1101 unsigned int count;
1102 } *vect_peel_info;
1104 typedef struct _vect_peel_extended_info
1106 struct _vect_peel_info peel_info;
1107 unsigned int inside_cost;
1108 unsigned int outside_cost;
1109 stmt_vector_for_cost body_cost_vec;
1110 } *vect_peel_extended_info;
1113 /* Peeling hashtable helpers. */
1115 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1117 static inline hashval_t hash (const _vect_peel_info *);
1118 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1121 inline hashval_t
1122 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1124 return (hashval_t) peel_info->npeel;
1127 inline bool
1128 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1130 return (a->npeel == b->npeel);
1134 /* Insert DR into peeling hash table with NPEEL as key. */
1136 static void
1137 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1138 loop_vec_info loop_vinfo, struct data_reference *dr,
1139 int npeel)
1141 struct _vect_peel_info elem, *slot;
1142 _vect_peel_info **new_slot;
1143 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1145 elem.npeel = npeel;
1146 slot = peeling_htab->find (&elem);
1147 if (slot)
1148 slot->count++;
1149 else
1151 slot = XNEW (struct _vect_peel_info);
1152 slot->npeel = npeel;
1153 slot->dr = dr;
1154 slot->count = 1;
1155 new_slot = peeling_htab->find_slot (slot, INSERT);
1156 *new_slot = slot;
1159 if (!supportable_dr_alignment
1160 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1161 slot->count += VECT_MAX_COST;
1165 /* Traverse peeling hash table to find peeling option that aligns maximum
1166 number of data accesses. */
1169 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1170 _vect_peel_extended_info *max)
1172 vect_peel_info elem = *slot;
1174 if (elem->count > max->peel_info.count
1175 || (elem->count == max->peel_info.count
1176 && max->peel_info.npeel > elem->npeel))
1178 max->peel_info.npeel = elem->npeel;
1179 max->peel_info.count = elem->count;
1180 max->peel_info.dr = elem->dr;
1183 return 1;
1187 /* Traverse peeling hash table and calculate cost for each peeling option.
1188 Find the one with the lowest cost. */
1191 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1192 _vect_peel_extended_info *min)
1194 vect_peel_info elem = *slot;
1195 int save_misalignment, dummy;
1196 unsigned int inside_cost = 0, outside_cost = 0, i;
1197 gimple *stmt = DR_STMT (elem->dr);
1198 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1199 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1200 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1201 struct data_reference *dr;
1202 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1204 prologue_cost_vec.create (2);
1205 body_cost_vec.create (2);
1206 epilogue_cost_vec.create (2);
1208 FOR_EACH_VEC_ELT (datarefs, i, dr)
1210 stmt = DR_STMT (dr);
1211 stmt_info = vinfo_for_stmt (stmt);
1212 /* For interleaving, only the alignment of the first access
1213 matters. */
1214 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1215 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1216 continue;
1218 save_misalignment = DR_MISALIGNMENT (dr);
1219 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1220 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1221 &body_cost_vec);
1222 SET_DR_MISALIGNMENT (dr, save_misalignment);
1225 outside_cost += vect_get_known_peeling_cost
1226 (loop_vinfo, elem->npeel, &dummy,
1227 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1228 &prologue_cost_vec, &epilogue_cost_vec);
1230 /* Prologue and epilogue costs are added to the target model later.
1231 These costs depend only on the scalar iteration cost, the
1232 number of peeling iterations finally chosen, and the number of
1233 misaligned statements. So discard the information found here. */
1234 prologue_cost_vec.release ();
1235 epilogue_cost_vec.release ();
1237 if (inside_cost < min->inside_cost
1238 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1240 min->inside_cost = inside_cost;
1241 min->outside_cost = outside_cost;
1242 min->body_cost_vec.release ();
1243 min->body_cost_vec = body_cost_vec;
1244 min->peel_info.dr = elem->dr;
1245 min->peel_info.npeel = elem->npeel;
1247 else
1248 body_cost_vec.release ();
1250 return 1;
1254 /* Choose best peeling option by traversing peeling hash table and either
1255 choosing an option with the lowest cost (if cost model is enabled) or the
1256 option that aligns as many accesses as possible. */
1258 static struct data_reference *
1259 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1260 loop_vec_info loop_vinfo,
1261 unsigned int *npeel,
1262 stmt_vector_for_cost *body_cost_vec)
1264 struct _vect_peel_extended_info res;
1266 res.peel_info.dr = NULL;
1267 res.body_cost_vec = stmt_vector_for_cost ();
1269 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1271 res.inside_cost = INT_MAX;
1272 res.outside_cost = INT_MAX;
1273 peeling_htab->traverse <_vect_peel_extended_info *,
1274 vect_peeling_hash_get_lowest_cost> (&res);
1276 else
1278 res.peel_info.count = 0;
1279 peeling_htab->traverse <_vect_peel_extended_info *,
1280 vect_peeling_hash_get_most_frequent> (&res);
1283 *npeel = res.peel_info.npeel;
1284 *body_cost_vec = res.body_cost_vec;
1285 return res.peel_info.dr;
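
For the known-misalignment candidates stored in this hash table, the peel amount itself comes from the misalignment expressed in elements, as computed further down in vect_enhance_data_refs_alignment. A hedged sketch of that formula (plain C, invented helper):

/* NPEEL for a DR with known misalignment: MIS is the misalignment in
   elements, NELEMENTS the (power-of-two) vector lane count, NEGATIVE
   nonzero for a backward-running access.  */
static unsigned
npeel_for_known_misalignment (unsigned mis, unsigned nelements, int negative)
{
  return (negative ? mis - nelements : nelements - mis) & (nelements - 1);
}
/* Examples: mis 3, nelements 4, forward -> 1 peeled iteration;
   an already aligned DR (mis 0) -> 0.  */
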
1289 /* Function vect_enhance_data_refs_alignment
1291 This pass will use loop versioning and loop peeling in order to enhance
1292 the alignment of data references in the loop.
1294 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1295 original loop is to be vectorized. Any other loops that are created by
1296 the transformations performed in this pass are not supposed to be
1297 vectorized. This restriction will be relaxed.
1299 This pass will require a cost model to guide it whether to apply peeling
1300 or versioning or a combination of the two. For example, the scheme that
1301 intel uses when given a loop with several memory accesses, is as follows:
1302 choose one memory access ('p') whose alignment you want to force by doing
1303 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1304 other accesses are not necessarily aligned, or (2) use loop versioning to
1305 generate one loop in which all accesses are aligned, and another loop in
1306 which only 'p' is necessarily aligned.
1308 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1309 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1310 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1312 Devising a cost model is the most critical aspect of this work. It will
1313 guide us on which access to peel for, whether to use loop versioning, how
1314 many versions to create, etc. The cost model will probably consist of
1315 generic considerations as well as target specific considerations (on
1316 powerpc for example, misaligned stores are more painful than misaligned
1317 loads).
1319 Here are the general steps involved in alignment enhancements:
1321 -- original loop, before alignment analysis:
1322 for (i=0; i<N; i++){
1323 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1324 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1327 -- After vect_compute_data_refs_alignment:
1328 for (i=0; i<N; i++){
1329 x = q[i]; # DR_MISALIGNMENT(q) = 3
1330 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1333 -- Possibility 1: we do loop versioning:
1334 if (p is aligned) {
1335 for (i=0; i<N; i++){ # loop 1A
1336 x = q[i]; # DR_MISALIGNMENT(q) = 3
1337 p[i] = y; # DR_MISALIGNMENT(p) = 0
1340 else {
1341 for (i=0; i<N; i++){ # loop 1B
1342 x = q[i]; # DR_MISALIGNMENT(q) = 3
1343 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1347 -- Possibility 2: we do loop peeling:
1348 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1349 x = q[i];
1350 p[i] = y;
1352 for (i = 3; i < N; i++){ # loop 2A
1353 x = q[i]; # DR_MISALIGNMENT(q) = 0
1354 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1357 -- Possibility 3: combination of loop peeling and versioning:
1358 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1359 x = q[i];
1360 p[i] = y;
1362 if (p is aligned) {
1363 for (i = 3; i<N; i++){ # loop 3A
1364 x = q[i]; # DR_MISALIGNMENT(q) = 0
1365 p[i] = y; # DR_MISALIGNMENT(p) = 0
1368 else {
1369 for (i = 3; i<N; i++){ # loop 3B
1370 x = q[i]; # DR_MISALIGNMENT(q) = 0
1371 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1375 These loops are later passed to loop_transform to be vectorized. The
1376 vectorizer will use the alignment information to guide the transformation
1377 (whether to generate regular loads/stores, or with special handling for
1378 misalignment). */
1380 bool
1381 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1383 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1384 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1385 enum dr_alignment_support supportable_dr_alignment;
1386 struct data_reference *dr0 = NULL, *first_store = NULL;
1387 struct data_reference *dr;
1388 unsigned int i, j;
1389 bool do_peeling = false;
1390 bool do_versioning = false;
1391 bool stat;
1392 gimple *stmt;
1393 stmt_vec_info stmt_info;
1394 unsigned int npeel = 0;
1395 bool all_misalignments_unknown = true;
1396 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1397 unsigned possible_npeel_number = 1;
1398 tree vectype;
1399 unsigned int nelements, mis, same_align_drs_max = 0;
1400 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1401 hash_table<peel_info_hasher> peeling_htab (1);
1403 if (dump_enabled_p ())
1404 dump_printf_loc (MSG_NOTE, vect_location,
1405 "=== vect_enhance_data_refs_alignment ===\n");
1407 /* Reset data so we can safely be called multiple times. */
1408 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1409 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1411 /* While cost model enhancements are expected in the future, the high level
1412 view of the code at this time is as follows:
1414 A) If there is a misaligned access then see if peeling to align
1415 this access can make all data references satisfy
1416 vect_supportable_dr_alignment. If so, update data structures
1417 as needed and return true.
1419 B) If peeling wasn't possible and there is a data reference with an
1420 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1421 then see if loop versioning checks can be used to make all data
1422 references satisfy vect_supportable_dr_alignment. If so, update
1423 data structures as needed and return true.
1425 C) If neither peeling nor versioning were successful then return false if
1426 any data reference does not satisfy vect_supportable_dr_alignment.
1428 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1430 Note, Possibility 3 above (which is peeling and versioning together) is not
1431 being done at this time. */
1433 /* (1) Peeling to force alignment. */
1435 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1436 Considerations:
1437 + How many accesses will become aligned due to the peeling
1438 - How many accesses will become unaligned due to the peeling,
1439 and the cost of misaligned accesses.
1440 - The cost of peeling (the extra runtime checks, the increase
1441 in code size). */
1443 FOR_EACH_VEC_ELT (datarefs, i, dr)
1445 stmt = DR_STMT (dr);
1446 stmt_info = vinfo_for_stmt (stmt);
1448 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1449 continue;
1451 /* For interleaving, only the alignment of the first access
1452 matters. */
1453 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1454 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1455 continue;
1457 /* For invariant accesses there is nothing to enhance. */
1458 if (integer_zerop (DR_STEP (dr)))
1459 continue;
1461 /* Strided accesses perform only component accesses, alignment is
1462 irrelevant for them. */
1463 if (STMT_VINFO_STRIDED_P (stmt_info)
1464 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1465 continue;
1467 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1468 do_peeling = vector_alignment_reachable_p (dr);
1469 if (do_peeling)
1471 if (known_alignment_for_access_p (dr))
1473 unsigned int npeel_tmp;
1474 bool negative = tree_int_cst_compare (DR_STEP (dr),
1475 size_zero_node) < 0;
1477 /* Save info about DR in the hash table. */
1478 vectype = STMT_VINFO_VECTYPE (stmt_info);
1479 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1480 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1481 TREE_TYPE (DR_REF (dr))));
1482 npeel_tmp = (negative
1483 ? (mis - nelements) : (nelements - mis))
1484 & (nelements - 1);
1486 /* For multiple types, it is possible that the bigger type access
1487 will have more than one peeling option. E.g., a loop with two
1488 types: one of size (vector size / 4), and the other one of
1489 size (vector size / 8). The vectorization factor will be 8. If both
1490 accesses are misaligned by 3, the first one needs one scalar
1491 iteration to become aligned, and the second one needs 5. But the
1492 first one will also be aligned by peeling 5 scalar
1493 iterations, and in that case both accesses will be aligned.
1494 Hence, except for the immediate peeling amount, we also want
1495 to try to add a full vector size, as long as we don't exceed the
1496 vectorization factor.
1497 We do this automatically for the cost model, since we calculate the cost
1498 for every peeling option. */
1499 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1501 if (STMT_SLP_TYPE (stmt_info))
1502 possible_npeel_number
1503 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1504 else
1505 possible_npeel_number = vf / nelements;
1508 /* Handle the aligned case. We may decide to align some other
1509 access, making DR unaligned. */
1510 if (DR_MISALIGNMENT (dr) == 0)
1512 npeel_tmp = 0;
1513 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1514 possible_npeel_number++;
1517 for (j = 0; j < possible_npeel_number; j++)
1519 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1520 dr, npeel_tmp);
1521 npeel_tmp += nelements;
1524 all_misalignments_unknown = false;
1525 /* Data-ref that was chosen for the case that all the
1526 misalignments are unknown is not relevant anymore, since we
1527 have a data-ref with known alignment. */
1528 dr0 = NULL;
1530 else
1532 /* If we don't know any misalignment values, we prefer
1533 peeling for the data-ref that has the maximum number of data-refs
1534 with the same alignment, unless the target prefers to align
1535 stores over loads. */
1536 if (all_misalignments_unknown)
1538 unsigned same_align_drs
1539 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1540 if (!dr0
1541 || same_align_drs_max < same_align_drs)
1543 same_align_drs_max = same_align_drs;
1544 dr0 = dr;
1546 /* For data-refs with the same number of related
1547 accesses prefer the one where the misalign
1548 computation will be invariant in the outermost loop. */
1549 else if (same_align_drs_max == same_align_drs)
1551 struct loop *ivloop0, *ivloop;
1552 ivloop0 = outermost_invariant_loop_for_expr
1553 (loop, DR_BASE_ADDRESS (dr0));
1554 ivloop = outermost_invariant_loop_for_expr
1555 (loop, DR_BASE_ADDRESS (dr));
1556 if ((ivloop && !ivloop0)
1557 || (ivloop && ivloop0
1558 && flow_loop_nested_p (ivloop, ivloop0)))
1559 dr0 = dr;
1562 if (!first_store && DR_IS_WRITE (dr))
1563 first_store = dr;
1566 /* If there are both known and unknown misaligned accesses in the
1567 loop, we choose peeling amount according to the known
1568 accesses. */
1569 if (!supportable_dr_alignment)
1571 dr0 = dr;
1572 if (!first_store && DR_IS_WRITE (dr))
1573 first_store = dr;
1577 else
1579 if (!aligned_access_p (dr))
1581 if (dump_enabled_p ())
1582 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1583 "vector alignment may not be reachable\n");
1584 break;
1589 /* Check if we can possibly peel the loop. */
1590 if (!vect_can_advance_ivs_p (loop_vinfo)
1591 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1592 || loop->inner)
1593 do_peeling = false;
1595 if (do_peeling
1596 && all_misalignments_unknown
1597 && vect_supportable_dr_alignment (dr0, false))
1599 /* Check if the target prefers to align stores over loads, i.e., if
1600 misaligned stores are more expensive than misaligned loads (taking
1601 drs with same alignment into account). */
1602 if (first_store && DR_IS_READ (dr0))
1604 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1605 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1606 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1607 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1608 stmt_vector_for_cost dummy;
1609 dummy.create (2);
1611 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1612 &dummy);
1613 vect_get_data_access_cost (first_store, &store_inside_cost,
1614 &store_outside_cost, &dummy);
1616 dummy.release ();
1618 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1619 aligning the load DR0). */
1620 load_inside_penalty = store_inside_cost;
1621 load_outside_penalty = store_outside_cost;
1622 for (i = 0;
1623 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1624 DR_STMT (first_store))).iterate (i, &dr);
1625 i++)
1626 if (DR_IS_READ (dr))
1628 load_inside_penalty += load_inside_cost;
1629 load_outside_penalty += load_outside_cost;
1631 else
1633 load_inside_penalty += store_inside_cost;
1634 load_outside_penalty += store_outside_cost;
1637 /* Calculate the penalty for leaving DR0 unaligned (by
1638 aligning the FIRST_STORE). */
1639 store_inside_penalty = load_inside_cost;
1640 store_outside_penalty = load_outside_cost;
1641 for (i = 0;
1642 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1643 DR_STMT (dr0))).iterate (i, &dr);
1644 i++)
1645 if (DR_IS_READ (dr))
1647 store_inside_penalty += load_inside_cost;
1648 store_outside_penalty += load_outside_cost;
1650 else
1652 store_inside_penalty += store_inside_cost;
1653 store_outside_penalty += store_outside_cost;
1656 if (load_inside_penalty > store_inside_penalty
1657 || (load_inside_penalty == store_inside_penalty
1658 && load_outside_penalty > store_outside_penalty))
1659 dr0 = first_store;
1662 /* In case there are only loads with different unknown misalignments, use
1663 peeling only if it may help to align other accesses in the loop or
1664 if it may help improve load bandwidth when we'd end up using
1665 unaligned loads. */
1666 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1667 if (!first_store
1668 && !STMT_VINFO_SAME_ALIGN_REFS (
1669 vinfo_for_stmt (DR_STMT (dr0))).length ()
1670 && (vect_supportable_dr_alignment (dr0, false)
1671 != dr_unaligned_supported
1672 || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1673 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1674 do_peeling = false;
1677 if (do_peeling && !dr0)
1679 /* Peeling is possible, but no data access requires alignment in order
1680 to be supported. So we try to choose the best possible peeling. */
1682 /* We should get here only if there are drs with known misalignment. */
1683 gcc_assert (!all_misalignments_unknown);
1685 /* Choose the best peeling from the hash table. */
1686 dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
1687 loop_vinfo, &npeel,
1688 &body_cost_vec);
1689 if (!dr0 || !npeel)
1690 do_peeling = false;
1693 if (do_peeling)
1695 stmt = DR_STMT (dr0);
1696 stmt_info = vinfo_for_stmt (stmt);
1697 vectype = STMT_VINFO_VECTYPE (stmt_info);
1698 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1700 if (known_alignment_for_access_p (dr0))
1702 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1703 size_zero_node) < 0;
1704 if (!npeel)
1706 /* Since it's known at compile time, compute the number of
1707 iterations in the peeled loop (the peeling factor) for use in
1708 updating DR_MISALIGNMENT values. The peeling factor is the
1709 vectorization factor minus the misalignment as an element
1710 count. */
1711 mis = DR_MISALIGNMENT (dr0);
1712 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1713 npeel = ((negative ? mis - nelements : nelements - mis)
1714 & (nelements - 1));
1717 /* For interleaved data access every iteration accesses all the
1718 members of the group, therefore we divide the number of iterations
1719 by the group size. */
1720 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1721 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1722 npeel /= GROUP_SIZE (stmt_info);
1724 if (dump_enabled_p ())
1725 dump_printf_loc (MSG_NOTE, vect_location,
1726 "Try peeling by %d\n", npeel);
1729 /* Ensure that all data refs can be vectorized after the peel. */
1730 FOR_EACH_VEC_ELT (datarefs, i, dr)
1732 int save_misalignment;
1734 if (dr == dr0)
1735 continue;
1737 stmt = DR_STMT (dr);
1738 stmt_info = vinfo_for_stmt (stmt);
1739 /* For interleaving, only the alignment of the first access
1740 matters. */
1741 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1742 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1743 continue;
1745 /* Strided accesses perform only component accesses, alignment is
1746 irrelevant for them. */
1747 if (STMT_VINFO_STRIDED_P (stmt_info)
1748 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1749 continue;
1751 save_misalignment = DR_MISALIGNMENT (dr);
1752 vect_update_misalignment_for_peel (dr, dr0, npeel);
1753 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1754 SET_DR_MISALIGNMENT (dr, save_misalignment);
1756 if (!supportable_dr_alignment)
1758 do_peeling = false;
1759 break;
1763 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1765 stat = vect_verify_datarefs_alignment (loop_vinfo);
1766 if (!stat)
1767 do_peeling = false;
1768 else
1770 body_cost_vec.release ();
1771 return stat;
1775 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1776 if (do_peeling)
1778 unsigned max_allowed_peel
1779 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1780 if (max_allowed_peel != (unsigned)-1)
1782 unsigned max_peel = npeel;
1783 if (max_peel == 0)
1785 gimple *dr_stmt = DR_STMT (dr0);
1786 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1787 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1788 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
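  /* E.g., for a V4SI vectype this allows for up to 3 peeled iterations,
     which is what gets compared against
     --param vect-max-peeling-for-alignment below.  */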
1790 if (max_peel > max_allowed_peel)
1792 do_peeling = false;
1793 if (dump_enabled_p ())
1794 dump_printf_loc (MSG_NOTE, vect_location,
1795 "Disable peeling, max peels reached: %d\n", max_peel);
1800 /* Cost model #2 - if peeling may result in a remaining loop not
1801 iterating enough to be vectorized then do not peel. */
1802 if (do_peeling
1803 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1805 unsigned max_peel
1806 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1807 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1808 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1809 do_peeling = false;
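  /* For instance, with a vectorization factor of 4 and an unknown peel
     amount (npeel == 0), max_peel == 3; a loop known to run only 6
     iterations is then rejected, since 6 < 4 + 3 may leave no full
     vector iteration after peeling.  */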
1812 if (do_peeling)
1814 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1815 If the misalignment of DR_i is identical to that of dr0 then set
1816 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1817 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1818 by the peeling factor times the element size of DR_i (MOD the
1819 vectorization factor times the size). Otherwise, the
1820 misalignment of DR_i must be set to unknown. */
1821 FOR_EACH_VEC_ELT (datarefs, i, dr)
1822 if (dr != dr0)
1823 vect_update_misalignment_for_peel (dr, dr0, npeel);
1825 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1826 if (npeel)
1827 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1828 else
1829 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1830 = DR_MISALIGNMENT (dr0);
1831 SET_DR_MISALIGNMENT (dr0, 0);
1832 if (dump_enabled_p ())
1834 dump_printf_loc (MSG_NOTE, vect_location,
1835 "Alignment of access forced using peeling.\n");
1836 dump_printf_loc (MSG_NOTE, vect_location,
1837 "Peeling for alignment will be applied.\n");
1839 /* The inside-loop cost will be accounted for in vectorizable_load
1840 and vectorizable_store correctly with adjusted alignments.
1841 Drop the body_cost_vec on the floor here. */
1842 body_cost_vec.release ();
1844 stat = vect_verify_datarefs_alignment (loop_vinfo);
1845 gcc_assert (stat);
1846 return stat;
1850 body_cost_vec.release ();
1852 /* (2) Versioning to force alignment. */
1854 /* Try versioning if:
1855 1) optimize loop for speed
1856 2) there is at least one unsupported misaligned data ref with an unknown
1857 misalignment, and
1858 3) all misaligned data refs with a known misalignment are supported, and
1859 4) the number of runtime alignment checks is within reason. */
1861 do_versioning =
1862 optimize_loop_nest_for_speed_p (loop)
1863 && (!loop->inner); /* FORNOW */
1865 if (do_versioning)
1867 FOR_EACH_VEC_ELT (datarefs, i, dr)
1869 stmt = DR_STMT (dr);
1870 stmt_info = vinfo_for_stmt (stmt);
1872 /* For interleaving, only the alignment of the first access
1873 matters. */
1874 if (aligned_access_p (dr)
1875 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1876 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1877 continue;
1879 if (STMT_VINFO_STRIDED_P (stmt_info))
1881 /* Strided loads perform only component accesses, alignment is
1882 irrelevant for them. */
1883 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1884 continue;
1885 do_versioning = false;
1886 break;
1889 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1891 if (!supportable_dr_alignment)
1893 gimple *stmt;
1894 int mask;
1895 tree vectype;
1897 if (known_alignment_for_access_p (dr)
1898 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1899 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1901 do_versioning = false;
1902 break;
1905 stmt = DR_STMT (dr);
1906 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1907 gcc_assert (vectype);
1909 /* The rightmost bits of an aligned address must be zeros.
1910 Construct the mask needed for this test. For example,
1911 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1912 mask must be 15 = 0xf. */
1913 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1915 /* FORNOW: use the same mask to test all potentially unaligned
1916 references in the loop. The vectorizer currently supports
1917 a single vector size, see the reference to
1918 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1919 vectorization factor is computed. */
1920 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1921 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1922 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1923 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1924 DR_STMT (dr));
1928 /* Versioning requires at least one misaligned data reference. */
1929 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1930 do_versioning = false;
1931 else if (!do_versioning)
1932 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1935 if (do_versioning)
1937 vec<gimple *> may_misalign_stmts
1938 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1939 gimple *stmt;
1941 /* It can now be assumed that the data references in the statements
1942 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1943 of the loop being vectorized. */
1944 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1946 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1947 dr = STMT_VINFO_DATA_REF (stmt_info);
1948 SET_DR_MISALIGNMENT (dr, 0);
1949 if (dump_enabled_p ())
1950 dump_printf_loc (MSG_NOTE, vect_location,
1951 "Alignment of access forced using versioning.\n");
1954 if (dump_enabled_p ())
1955 dump_printf_loc (MSG_NOTE, vect_location,
1956 "Versioning for alignment will be applied.\n");
1958 /* Peeling and versioning can't be done together at this time. */
1959 gcc_assert (! (do_peeling && do_versioning));
1961 stat = vect_verify_datarefs_alignment (loop_vinfo);
1962 gcc_assert (stat);
1963 return stat;
1966 /* This point is reached if neither peeling nor versioning is being done. */
1967 gcc_assert (! (do_peeling || do_versioning));
1969 stat = vect_verify_datarefs_alignment (loop_vinfo);
1970 return stat;
1974 /* Function vect_find_same_alignment_drs.
1976 Update group and alignment relations according to the chosen
1977 vectorization factor. */
1979 static void
1980 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1981 loop_vec_info loop_vinfo)
1983 unsigned int i;
1984 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1985 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1986 struct data_reference *dra = DDR_A (ddr);
1987 struct data_reference *drb = DDR_B (ddr);
1988 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1989 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1990 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1991 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1992 lambda_vector dist_v;
1993 unsigned int loop_depth;
1995 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1996 return;
1998 if (dra == drb)
1999 return;
2001 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
2002 return;
2004 /* Loop-based vectorization and known data dependence. */
2005 if (DDR_NUM_DIST_VECTS (ddr) == 0)
2006 return;
2008 /* Data-dependence analysis reports a distance vector of zero
2009 for data-references that overlap only in the first iteration
2010 but whose steps have different signs (see PR45764).
2011 So as a sanity check require equal DR_STEP. */
2012 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2013 return;
2015 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
2016 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
2018 int dist = dist_v[loop_depth];
2020 if (dump_enabled_p ())
2021 dump_printf_loc (MSG_NOTE, vect_location,
2022 "dependence distance = %d.\n", dist);
2024 /* Same loop iteration. */
2025 if (dist == 0
2026 || (dist % vectorization_factor == 0 && dra_size == drb_size))
2028 /* Two references whose distance is zero, or a multiple of the VF with equal element sizes, have the same alignment. */
2029 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2030 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
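  /* E.g., with a vectorization factor of 4 and two int accesses a[i]
     and a[i+4]: the dependence distance is 4, 4 % 4 == 0 and the
     element sizes match, so both refs are recorded as having the same
     alignment.  */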
2031 if (dump_enabled_p ())
2033 dump_printf_loc (MSG_NOTE, vect_location,
2034 "accesses have the same alignment.\n");
2035 dump_printf (MSG_NOTE,
2036 "dependence distance modulo vf == 0 between ");
2037 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2038 dump_printf (MSG_NOTE, " and ");
2039 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2040 dump_printf (MSG_NOTE, "\n");
2047 /* Function vect_analyze_data_refs_alignment
2049 Analyze the alignment of the data-references in the loop.
2050 Return FALSE if a data reference is found that cannot be vectorized. */
2052 bool
2053 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2055 if (dump_enabled_p ())
2056 dump_printf_loc (MSG_NOTE, vect_location,
2057 "=== vect_analyze_data_refs_alignment ===\n");
2059 /* Mark groups of data references with same alignment using
2060 data dependence information. */
2061 vec<ddr_p> ddrs = vinfo->ddrs;
2062 struct data_dependence_relation *ddr;
2063 unsigned int i;
2065 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2066 vect_find_same_alignment_drs (ddr, vinfo);
2068 vec<data_reference_p> datarefs = vinfo->datarefs;
2069 struct data_reference *dr;
2071 FOR_EACH_VEC_ELT (datarefs, i, dr)
2073 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2074 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2075 && !vect_compute_data_ref_alignment (dr))
2077 /* Strided accesses perform only component accesses, misalignment
2078 information is irrelevant for them. */
2079 if (STMT_VINFO_STRIDED_P (stmt_info)
2080 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2081 continue;
2083 if (dump_enabled_p ())
2084 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2085 "not vectorized: can't calculate alignment "
2086 "for data ref.\n");
2088 return false;
2092 return true;
2096 /* Analyze alignment of DRs of stmts in NODE. */
2098 static bool
2099 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2101 /* We vectorize from the first scalar stmt in the node unless
2102 the node is permuted in which case we start from the first
2103 element in the group. */
2104 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2105 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2106 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2108 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2109 if (! vect_compute_data_ref_alignment (dr)
2110 || ! verify_data_ref_alignment (dr))
2112 if (dump_enabled_p ())
2113 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2114 "not vectorized: bad data alignment in basic "
2115 "block.\n");
2116 return false;
2119 return true;
2122 /* Function vect_slp_analyze_and_verify_instance_alignment
2124 Analyze the alignment of the data-references in the SLP instance.
2125 Return FALSE if a data reference is found that cannot be vectorized. */
2127 bool
2128 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2130 if (dump_enabled_p ())
2131 dump_printf_loc (MSG_NOTE, vect_location,
2132 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2134 slp_tree node;
2135 unsigned i;
2136 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2137 if (! vect_slp_analyze_and_verify_node_alignment (node))
2138 return false;
2140 node = SLP_INSTANCE_TREE (instance);
2141 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2142 && ! vect_slp_analyze_and_verify_node_alignment
2143 (SLP_INSTANCE_TREE (instance)))
2144 return false;
2146 return true;
2150 /* Analyze groups of accesses: check that DR belongs to a group of
2151 accesses of legal size, step, etc. Detect gaps, single element
2152 interleaving, and other special cases. Set grouped access info.
2153 Collect groups of strided stores for further use in SLP analysis.
2154 Worker for vect_analyze_group_access. */
2156 static bool
2157 vect_analyze_group_access_1 (struct data_reference *dr)
2159 tree step = DR_STEP (dr);
2160 tree scalar_type = TREE_TYPE (DR_REF (dr));
2161 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2162 gimple *stmt = DR_STMT (dr);
2163 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2164 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2165 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2166 HOST_WIDE_INT dr_step = -1;
2167 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2168 bool slp_impossible = false;
2170 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2171 size of the interleaving group (including gaps). */
2172 if (tree_fits_shwi_p (step))
2174 dr_step = tree_to_shwi (step);
2175 /* Check that STEP is a multiple of type size. Otherwise there is
2176 a non-element-sized gap at the end of the group which we
2177 cannot represent in GROUP_GAP or GROUP_SIZE.
2178 ??? As we can handle non-constant step fine here we should
2179 simply remove uses of GROUP_GAP between the last and first
2180 element and instead rely on DR_STEP. GROUP_SIZE then would
2181 simply not include that gap. */
2182 if ((dr_step % type_size) != 0)
2184 if (dump_enabled_p ())
2186 dump_printf_loc (MSG_NOTE, vect_location,
2187 "Step ");
2188 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2189 dump_printf (MSG_NOTE,
2190 " is not a multiple of the element size for ");
2191 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2192 dump_printf (MSG_NOTE, "\n");
2194 return false;
2196 groupsize = absu_hwi (dr_step) / type_size;
2198 else
2199 groupsize = 0;
2201 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2202 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2204 /* Check if this DR is part of an interleaving group, and is a single
2205 element of the group that is accessed in the loop. */
2207 /* Gaps are supported only for loads. STEP must be a multiple of the type
2208 size. The size of the group must be a power of 2. */
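  /* For example, a load from a[4*i] with 4-byte int elements: step == 16,
     type_size == 4, groupsize == 4 (a power of two), so the statement
     forms its own interleaving group of size 4 with gaps.  */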
2209 if (DR_IS_READ (dr)
2210 && (dr_step % type_size) == 0
2211 && groupsize > 0
2212 && exact_log2 (groupsize) != -1)
2214 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2215 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2216 if (dump_enabled_p ())
2218 dump_printf_loc (MSG_NOTE, vect_location,
2219 "Detected single element interleaving ");
2220 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2221 dump_printf (MSG_NOTE, " step ");
2222 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2223 dump_printf (MSG_NOTE, "\n");
2226 return true;
2229 if (dump_enabled_p ())
2231 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2232 "not consecutive access ");
2233 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2236 if (bb_vinfo)
2238 /* Mark the statement as unvectorizable. */
2239 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2240 return true;
2243 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2244 STMT_VINFO_STRIDED_P (stmt_info) = true;
2245 return true;
2248 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2250 /* First stmt in the interleaving chain. Check the chain. */
2251 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2252 struct data_reference *data_ref = dr;
2253 unsigned int count = 1;
2254 tree prev_init = DR_INIT (data_ref);
2255 gimple *prev = stmt;
2256 HOST_WIDE_INT diff, gaps = 0;
2258 while (next)
2260 /* Skip same data-refs. In case two or more stmts share a
2261 data-ref (supported only for loads), we vectorize only the first
2262 stmt, and the rest get their vectorized loads from the first
2263 one. */
2264 if (!tree_int_cst_compare (DR_INIT (data_ref),
2265 DR_INIT (STMT_VINFO_DATA_REF (
2266 vinfo_for_stmt (next)))))
2268 if (DR_IS_WRITE (data_ref))
2270 if (dump_enabled_p ())
2271 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2272 "Two store stmts share the same dr.\n");
2273 return false;
2276 if (dump_enabled_p ())
2277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2278 "Two or more load stmts share the same dr.\n");
2280 /* For load use the same data-ref load. */
2281 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2283 prev = next;
2284 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2285 continue;
2288 prev = next;
2289 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2291 /* All group members have the same STEP by construction. */
2292 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2294 /* Check that the distance between two accesses is equal to the type
2295 size. Otherwise, we have gaps. */
2296 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2297 - TREE_INT_CST_LOW (prev_init)) / type_size;
2298 if (diff != 1)
2300 /* FORNOW: SLP of accesses with gaps is not supported. */
2301 slp_impossible = true;
2302 if (DR_IS_WRITE (data_ref))
2304 if (dump_enabled_p ())
2305 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2306 "interleaved store with gaps\n");
2307 return false;
2310 gaps += diff - 1;
2313 last_accessed_element += diff;
2315 /* Store the gap from the previous member of the group. If there is no
2316 gap in the access, GROUP_GAP is always 1. */
2317 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2319 prev_init = DR_INIT (data_ref);
2320 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2321 /* Count the number of data-refs in the chain. */
2322 count++;
2325 if (groupsize == 0)
2326 groupsize = count + gaps;
2328 if (groupsize > UINT_MAX)
2330 if (dump_enabled_p ())
2331 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2332 "group is too large\n");
2333 return false;
2336 /* Check that the size of the interleaving is equal to count for stores,
2337 i.e., that there are no gaps. */
2338 if (groupsize != count
2339 && !DR_IS_READ (dr))
2341 if (dump_enabled_p ())
2342 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2343 "interleaved store with gaps\n");
2344 return false;
2347 /* If there is a gap after the last load in the group it is the
2348 difference between the groupsize and the last accessed
2349 element.
2350 When there is no gap, this difference should be 0. */
2351 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
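  /* E.g., for loads from a[3*i] and a[3*i+1] (groupsize == 3):
     last_accessed_element == 2, so the first statement records a gap of
     3 - 2 == 1 element after the group.  */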
2353 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2354 if (dump_enabled_p ())
2356 dump_printf_loc (MSG_NOTE, vect_location,
2357 "Detected interleaving ");
2358 if (DR_IS_READ (dr))
2359 dump_printf (MSG_NOTE, "load ");
2360 else
2361 dump_printf (MSG_NOTE, "store ");
2362 dump_printf (MSG_NOTE, "of size %u starting with ",
2363 (unsigned)groupsize);
2364 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2365 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2366 dump_printf_loc (MSG_NOTE, vect_location,
2367 "There is a gap of %u elements after the group\n",
2368 GROUP_GAP (vinfo_for_stmt (stmt)));
2371 /* SLP: create an SLP data structure for every interleaving group of
2372 stores for further analysis in vect_analyze_slp. */
2373 if (DR_IS_WRITE (dr) && !slp_impossible)
2375 if (loop_vinfo)
2376 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2377 if (bb_vinfo)
2378 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2382 return true;
2385 /* Analyze groups of accesses: check that DR belongs to a group of
2386 accesses of legal size, step, etc. Detect gaps, single element
2387 interleaving, and other special cases. Set grouped access info.
2388 Collect groups of strided stores for further use in SLP analysis. */
2390 static bool
2391 vect_analyze_group_access (struct data_reference *dr)
2393 if (!vect_analyze_group_access_1 (dr))
2395 /* Dissolve the group if present. */
2396 gimple *next;
2397 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2398 while (stmt)
2400 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2401 next = GROUP_NEXT_ELEMENT (vinfo);
2402 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2403 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2404 stmt = next;
2406 return false;
2408 return true;
2411 /* Analyze the access pattern of the data-reference DR.
2412 In case of non-consecutive accesses call vect_analyze_group_access() to
2413 analyze groups of accesses. */
2415 static bool
2416 vect_analyze_data_ref_access (struct data_reference *dr)
2418 tree step = DR_STEP (dr);
2419 tree scalar_type = TREE_TYPE (DR_REF (dr));
2420 gimple *stmt = DR_STMT (dr);
2421 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2422 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2423 struct loop *loop = NULL;
2425 if (loop_vinfo)
2426 loop = LOOP_VINFO_LOOP (loop_vinfo);
2428 if (loop_vinfo && !step)
2430 if (dump_enabled_p ())
2431 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2432 "bad data-ref access in loop\n");
2433 return false;
2436 /* Allow loads with zero step in inner-loop vectorization. */
2437 if (loop_vinfo && integer_zerop (step))
2439 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2440 if (!nested_in_vect_loop_p (loop, stmt))
2441 return DR_IS_READ (dr);
2442 /* Allow references with zero step for outer loops marked
2443 with pragma omp simd only - it guarantees absence of
2444 loop-carried dependencies between inner loop iterations. */
2445 if (!loop->force_vectorize)
2447 if (dump_enabled_p ())
2448 dump_printf_loc (MSG_NOTE, vect_location,
2449 "zero step in inner loop of nest\n");
2450 return false;
2454 if (loop && nested_in_vect_loop_p (loop, stmt))
2456 /* Interleaved accesses are not yet supported within outer-loop
2457 vectorization for references in the inner-loop. */
2458 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2460 /* For the rest of the analysis we use the outer-loop step. */
2461 step = STMT_VINFO_DR_STEP (stmt_info);
2462 if (integer_zerop (step))
2464 if (dump_enabled_p ())
2465 dump_printf_loc (MSG_NOTE, vect_location,
2466 "zero step in outer loop.\n");
2467 return DR_IS_READ (dr);
2471 /* Consecutive? */
2472 if (TREE_CODE (step) == INTEGER_CST)
2474 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2475 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2476 || (dr_step < 0
2477 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2479 /* Mark that it is not interleaving. */
2480 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2481 return true;
2485 if (loop && nested_in_vect_loop_p (loop, stmt))
2487 if (dump_enabled_p ())
2488 dump_printf_loc (MSG_NOTE, vect_location,
2489 "grouped access in outer loop.\n");
2490 return false;
2494 /* Assume this is a DR handled by the non-constant strided load case. */
2495 if (TREE_CODE (step) != INTEGER_CST)
2496 return (STMT_VINFO_STRIDED_P (stmt_info)
2497 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2498 || vect_analyze_group_access (dr)));
2500 /* Non-consecutive access - check if it's part of an interleaving group. */
2501 return vect_analyze_group_access (dr);
2506 /* A helper function used in the comparator function to sort data
2507 references. T1 and T2 are two data references to be compared.
2508 The function returns -1, 0, or 1. */
2510 static int
2511 compare_tree (tree t1, tree t2)
2513 int i, cmp;
2514 enum tree_code code;
2515 char tclass;
2517 if (t1 == t2)
2518 return 0;
2519 if (t1 == NULL)
2520 return -1;
2521 if (t2 == NULL)
2522 return 1;
2524 STRIP_NOPS (t1);
2525 STRIP_NOPS (t2);
2527 if (TREE_CODE (t1) != TREE_CODE (t2))
2528 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2530 code = TREE_CODE (t1);
2531 switch (code)
2533 /* For const values, we can just use hash values for comparisons. */
2534 case INTEGER_CST:
2535 case REAL_CST:
2536 case FIXED_CST:
2537 case STRING_CST:
2538 case COMPLEX_CST:
2539 case VECTOR_CST:
2541 hashval_t h1 = iterative_hash_expr (t1, 0);
2542 hashval_t h2 = iterative_hash_expr (t2, 0);
2543 if (h1 != h2)
2544 return h1 < h2 ? -1 : 1;
2545 break;
2548 case SSA_NAME:
2549 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2550 if (cmp != 0)
2551 return cmp;
2553 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2554 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2555 break;
2557 default:
2558 tclass = TREE_CODE_CLASS (code);
2560 /* For var-decl, we could compare their UIDs. */
2561 if (tclass == tcc_declaration)
2563 if (DECL_UID (t1) != DECL_UID (t2))
2564 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2565 break;
2568 /* For expressions with operands, compare their operands recursively. */
2569 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2571 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2572 if (cmp != 0)
2573 return cmp;
2577 return 0;
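  /* Note: compare_tree yields a total order over trees, which lets the
     qsort-based callers dr_group_sort_cmp and comp_dr_with_seg_len_pair
     place data-refs with equal bases, offsets and steps next to each
     other.  */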
2581 /* Compare two data-references DRA and DRB to sort them into chunks
2582 suitable for grouping. */
2584 static int
2585 dr_group_sort_cmp (const void *dra_, const void *drb_)
2587 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2588 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2589 int cmp;
2591 /* Stabilize sort. */
2592 if (dra == drb)
2593 return 0;
2595 /* Ordering of DRs according to base. */
2596 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2598 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2599 if (cmp != 0)
2600 return cmp;
2603 /* And according to DR_OFFSET. */
2604 if (!dr_equal_offsets_p (dra, drb))
2606 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2607 if (cmp != 0)
2608 return cmp;
2611 /* Put reads before writes. */
2612 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2613 return DR_IS_READ (dra) ? -1 : 1;
2615 /* Then sort by access size. */
2616 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2617 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2619 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2620 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2621 if (cmp != 0)
2622 return cmp;
2625 /* And by step. */
2626 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2628 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2629 if (cmp != 0)
2630 return cmp;
2633 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2634 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2635 if (cmp == 0)
2636 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2637 return cmp;
2640 /* Function vect_analyze_data_ref_accesses.
2642 Analyze the access pattern of all the data references in the loop.
2644 FORNOW: the only access pattern that is considered vectorizable is a
2645 simple step 1 (consecutive) access.
2647 FORNOW: handle only arrays and pointer accesses. */
2649 bool
2650 vect_analyze_data_ref_accesses (vec_info *vinfo)
2652 unsigned int i;
2653 vec<data_reference_p> datarefs = vinfo->datarefs;
2654 struct data_reference *dr;
2656 if (dump_enabled_p ())
2657 dump_printf_loc (MSG_NOTE, vect_location,
2658 "=== vect_analyze_data_ref_accesses ===\n");
2660 if (datarefs.is_empty ())
2661 return true;
2663 /* Sort the array of datarefs to make building the interleaving chains
2664 linear. Don't modify the original vector's order, it is needed for
2665 determining what dependencies are reversed. */
2666 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2667 datarefs_copy.qsort (dr_group_sort_cmp);
2669 /* Build the interleaving chains. */
2670 for (i = 0; i < datarefs_copy.length () - 1;)
2672 data_reference_p dra = datarefs_copy[i];
2673 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2674 stmt_vec_info lastinfo = NULL;
2675 for (i = i + 1; i < datarefs_copy.length (); ++i)
2677 data_reference_p drb = datarefs_copy[i];
2678 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2680 /* ??? Imperfect sorting (non-compatible types, non-modulo
2681 accesses, same accesses) can lead to a group being artificially
2682 split here as we don't just skip over those. If it really
2683 matters we can push those to a worklist and re-iterate
2684 over them. Then we can just skip ahead to the next DR here. */
2686 /* Check that the data-refs have the same first location (except init)
2687 and that they are both either stores or loads (not one load and one
2688 store, and not masked loads or stores). */
2689 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2690 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2691 DR_BASE_ADDRESS (drb), 0)
2692 || !dr_equal_offsets_p (dra, drb)
2693 || !gimple_assign_single_p (DR_STMT (dra))
2694 || !gimple_assign_single_p (DR_STMT (drb)))
2695 break;
2697 /* Check that the data-refs have the same constant size. */
2698 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2699 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2700 if (!tree_fits_uhwi_p (sza)
2701 || !tree_fits_uhwi_p (szb)
2702 || !tree_int_cst_equal (sza, szb))
2703 break;
2705 /* Check that the data-refs have the same step. */
2706 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2707 break;
2709 /* Do not place the same access in the interleaving chain twice. */
2710 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2711 break;
2713 /* Check the types are compatible.
2714 ??? We don't distinguish this during sorting. */
2715 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2716 TREE_TYPE (DR_REF (drb))))
2717 break;
2719 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2720 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2721 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2722 gcc_assert (init_a < init_b);
2724 /* If init_b == init_a + the size of the type * k, we have an
2725 interleaving, and DRA is accessed before DRB. */
2726 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2727 if (type_size_a == 0
2728 || (init_b - init_a) % type_size_a != 0)
2729 break;
2731 /* If we have a store, the accesses are adjacent. This splits
2732 groups into chunks we support (we don't support vectorization
2733 of stores with gaps). */
2734 if (!DR_IS_READ (dra)
2735 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2736 (DR_INIT (datarefs_copy[i-1]))
2737 != type_size_a))
2738 break;
2740 /* If the step (when constant and nonzero) does not exceed the
2741 difference between the data-refs' inits, the refs do not form an
2742 interleaving group within a single iteration; breaking here splits
2742 groups into suitable sizes. */
2743 if (tree_fits_shwi_p (DR_STEP (dra)))
2745 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2746 if (step != 0 && step <= (init_b - init_a))
2747 break;
2750 if (dump_enabled_p ())
2752 dump_printf_loc (MSG_NOTE, vect_location,
2753 "Detected interleaving ");
2754 if (DR_IS_READ (dra))
2755 dump_printf (MSG_NOTE, "load ");
2756 else
2757 dump_printf (MSG_NOTE, "store ");
2758 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2759 dump_printf (MSG_NOTE, " and ");
2760 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2761 dump_printf (MSG_NOTE, "\n");
2764 /* Link the found element into the group list. */
2765 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2767 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2768 lastinfo = stmtinfo_a;
2770 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2771 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2772 lastinfo = stmtinfo_b;
2776 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2777 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2778 && !vect_analyze_data_ref_access (dr))
2780 if (dump_enabled_p ())
2781 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2782 "not vectorized: complicated access pattern.\n");
2784 if (is_a <bb_vec_info> (vinfo))
2786 /* Mark the statement as not vectorizable. */
2787 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2788 continue;
2790 else
2792 datarefs_copy.release ();
2793 return false;
2797 datarefs_copy.release ();
2798 return true;
2802 /* Operator == between two dr_with_seg_len objects.
2804 This equality operator is used to make sure two data refs
2805 are the same so that we will consider combining the
2806 aliasing checks of those two pairs of data-dependent data
2807 refs. */
2809 static bool
2810 operator == (const dr_with_seg_len& d1,
2811 const dr_with_seg_len& d2)
2813 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2814 DR_BASE_ADDRESS (d2.dr), 0)
2815 && compare_tree (d1.offset, d2.offset) == 0
2816 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2819 /* Function comp_dr_with_seg_len_pair.
2821 Comparison function for sorting objects of dr_with_seg_len_pair_t
2822 so that we can combine aliasing checks in one scan. */
2824 static int
2825 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2827 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2828 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2830 const dr_with_seg_len &p11 = p1->first,
2831 &p12 = p1->second,
2832 &p21 = p2->first,
2833 &p22 = p2->second;
2835 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2836 if a and c have the same base address and step, and b and d have the same
2837 address and step. Therefore, if either a&c or b&d don't have the same address
2838 and step, we don't care about the order of those two pairs after sorting. */
2839 int comp_res;
2841 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2842 DR_BASE_ADDRESS (p21.dr))) != 0)
2843 return comp_res;
2844 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2845 DR_BASE_ADDRESS (p22.dr))) != 0)
2846 return comp_res;
2847 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2848 return comp_res;
2849 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2850 return comp_res;
2851 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2852 return comp_res;
2853 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2854 return comp_res;
2856 return 0;
2859 /* Function vect_vfa_segment_size.
2861 Create an expression that computes the size of the segment
2862 that will be accessed for a data reference. The function takes into
2863 account that realignment loads may access one more vector.
2865 Input:
2866 DR: The data reference.
2867 LENGTH_FACTOR: segment length to consider.
2869 Return an expression whose value is the size of segment which will be
2870 accessed by DR. */
2872 static tree
2873 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2875 tree segment_length;
2877 if (integer_zerop (DR_STEP (dr)))
2878 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2879 else
2880 segment_length = size_binop (MULT_EXPR,
2881 fold_convert (sizetype, DR_STEP (dr)),
2882 fold_convert (sizetype, length_factor));
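  /* E.g., an int access with DR_STEP == 4 and LENGTH_FACTOR == 4 (the
     vectorization factor) yields a 16-byte segment; a zero-step access
     contributes just its element size.  */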
2884 if (vect_supportable_dr_alignment (dr, false)
2885 == dr_explicit_realign_optimized)
2887 tree vector_size = TYPE_SIZE_UNIT
2888 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2890 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2892 return segment_length;
2895 /* Function vect_prune_runtime_alias_test_list.
2897 Prune a list of ddrs to be tested at run-time by versioning for alias.
2898 Merge several alias checks into one if possible.
2899 Return FALSE if the resulting list of ddrs is longer than allowed by
2900 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2902 bool
2903 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2905 vec<ddr_p> may_alias_ddrs =
2906 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2907 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2908 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2909 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2910 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2912 ddr_p ddr;
2913 unsigned int i;
2914 tree length_factor;
2916 if (dump_enabled_p ())
2917 dump_printf_loc (MSG_NOTE, vect_location,
2918 "=== vect_prune_runtime_alias_test_list ===\n");
2920 if (may_alias_ddrs.is_empty ())
2921 return true;
2923 /* Basically, for each pair of dependent data refs store_ptr_0
2924 and load_ptr_0, we create an expression:
2926 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2927 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2929 for aliasing checks. However, in some cases we can decrease
2930 the number of checks by combining two checks into one. For
2931 example, suppose we have another pair of data refs store_ptr_0
2932 and load_ptr_1, and if the following condition is satisfied:
2934 load_ptr_0 < load_ptr_1 &&
2935 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2937 (this condition means that, in each iteration of the vectorized loop,
2938 the accessed memory of store_ptr_0 cannot be between the memory
2939 of load_ptr_0 and load_ptr_1.)
2941 we then can use only the following expression to finish the
2942 aliasing checks between store_ptr_0 & load_ptr_0 and
2943 store_ptr_0 & load_ptr_1:
2945 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2946 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2948 Note that we only consider this case when load_ptr_0 and load_ptr_1 have
2949 the same base address. */
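  /* A concrete instance: in a loop computing a[i] = b[i] + b[i+1] where
     a may alias b, the two would-be checks of the store against b[i]
     and against b[i+1] can collapse into a single check of the store
     against the combined segment covering both loads.  */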
2951 comp_alias_ddrs.create (may_alias_ddrs.length ());
2953 /* First, we collect all data ref pairs for aliasing checks. */
2954 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2956 struct data_reference *dr_a, *dr_b;
2957 gimple *dr_group_first_a, *dr_group_first_b;
2958 tree segment_length_a, segment_length_b;
2959 gimple *stmt_a, *stmt_b;
2961 dr_a = DDR_A (ddr);
2962 stmt_a = DR_STMT (DDR_A (ddr));
2963 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2964 if (dr_group_first_a)
2966 stmt_a = dr_group_first_a;
2967 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2970 dr_b = DDR_B (ddr);
2971 stmt_b = DR_STMT (DDR_B (ddr));
2972 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2973 if (dr_group_first_b)
2975 stmt_b = dr_group_first_b;
2976 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2979 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2980 length_factor = scalar_loop_iters;
2981 else
2982 length_factor = size_int (vect_factor);
2983 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2984 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2986 dr_with_seg_len_pair_t dr_with_seg_len_pair
2987 (dr_with_seg_len (dr_a, segment_length_a),
2988 dr_with_seg_len (dr_b, segment_length_b));
2990 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2991 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2993 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2996 /* Second, we sort the collected data ref pairs so that we can scan
2997 them once to combine all possible aliasing checks. */
2998 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
3000 /* Third, we scan the sorted dr pairs and check if we can combine
3001 alias checks of two neighbouring dr pairs. */
3002 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
3004 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
3005 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
3006 *dr_b1 = &comp_alias_ddrs[i-1].second,
3007 *dr_a2 = &comp_alias_ddrs[i].first,
3008 *dr_b2 = &comp_alias_ddrs[i].second;
3010 /* Remove duplicate data ref pairs. */
3011 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
3013 if (dump_enabled_p ())
3015 dump_printf_loc (MSG_NOTE, vect_location,
3016 "found equal ranges ");
3017 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3018 DR_REF (dr_a1->dr));
3019 dump_printf (MSG_NOTE, ", ");
3020 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3021 DR_REF (dr_b1->dr));
3022 dump_printf (MSG_NOTE, " and ");
3023 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3024 DR_REF (dr_a2->dr));
3025 dump_printf (MSG_NOTE, ", ");
3026 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3027 DR_REF (dr_b2->dr));
3028 dump_printf (MSG_NOTE, "\n");
3031 comp_alias_ddrs.ordered_remove (i--);
3032 continue;
3035 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
3037 /* We consider the case that DR_B1 and DR_B2 are same memrefs,
3038 and DR_A1 and DR_A2 are two consecutive memrefs. */
3039 if (*dr_a1 == *dr_a2)
3041 std::swap (dr_a1, dr_b1);
3042 std::swap (dr_a2, dr_b2);
3045 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
3046 DR_BASE_ADDRESS (dr_a2->dr),
3048 || !tree_fits_shwi_p (dr_a1->offset)
3049 || !tree_fits_shwi_p (dr_a2->offset))
3050 continue;
3052 HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
3053 - tree_to_shwi (dr_a1->offset));
3056 /* Now we check if the following condition is satisfied:
3058 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
3060 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
3061 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
3062 have to make a best estimation. We can get the minimum value
3063 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
3064 then either of the following two conditions can guarantee the
3065 one above:
3067 1: DIFF <= MIN_SEG_LEN_B
3068 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
3072 HOST_WIDE_INT min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
3073 ? tree_to_shwi (dr_b1->seg_len)
3074 : vect_factor);
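  /* For example, if DR_A1 and DR_A2 access b[i] and b[i+1] (DIFF == 4
     bytes) while DR_B's minimum segment length is 16 bytes, condition 1
     holds: DR_A1's segment length becomes DR_A2's plus 4 bytes and the
     second, now redundant, pair is removed below.  */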
3076 if (diff <= min_seg_len_b
3077 || (tree_fits_shwi_p (dr_a1->seg_len)
3078 && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
3080 if (dump_enabled_p ())
3082 dump_printf_loc (MSG_NOTE, vect_location,
3083 "merging ranges for ");
3084 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3085 DR_REF (dr_a1->dr));
3086 dump_printf (MSG_NOTE, ", ");
3087 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3088 DR_REF (dr_b1->dr));
3089 dump_printf (MSG_NOTE, " and ");
3090 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3091 DR_REF (dr_a2->dr));
3092 dump_printf (MSG_NOTE, ", ");
3093 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3094 DR_REF (dr_b2->dr));
3095 dump_printf (MSG_NOTE, "\n");
3098 dr_a1->seg_len = size_binop (PLUS_EXPR,
3099 dr_a2->seg_len, size_int (diff));
3100 comp_alias_ddrs.ordered_remove (i--);
3105 dump_printf_loc (MSG_NOTE, vect_location,
3106 "improved number of alias checks from %d to %d\n",
3107 may_alias_ddrs.length (), comp_alias_ddrs.length ());
3108 if ((int) comp_alias_ddrs.length () >
3109 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3110 return false;
3112 return true;
3115 /* Check whether a non-affine read or write in stmt is suitable for gather load
3116 or scatter store and if so, return a builtin decl for that operation. */
3118 tree
3119 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo, tree *basep,
3120 tree *offp, int *scalep)
3122 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3123 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3124 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3125 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3126 tree offtype = NULL_TREE;
3127 tree decl, base, off;
3128 machine_mode pmode;
3129 int punsignedp, reversep, pvolatilep = 0;
3131 base = DR_REF (dr);
3132 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3133 see if we can use the def stmt of the address. */
3134 if (is_gimple_call (stmt)
3135 && gimple_call_internal_p (stmt)
3136 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3137 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3138 && TREE_CODE (base) == MEM_REF
3139 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3140 && integer_zerop (TREE_OPERAND (base, 1))
3141 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3143 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3144 if (is_gimple_assign (def_stmt)
3145 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3146 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3149 /* The gather and scatter builtins need address of the form
3150 loop_invariant + vector * {1, 2, 4, 8}
3152 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3153 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3154 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3155 multiplications and additions in it. To get a vector, we need
3156 a single SSA_NAME that will be defined in the loop and will
3157 contain everything that is not loop invariant and that can be
3158 vectorized. The following code attempts to find such a preexisting
3159 SSA_NAME OFF and put the loop invariants into a tree BASE
3160 that can be gimplified before the loop. */
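  /* Roughly, for a gather like a[b[i]] with 4-byte elements this ends
     up with BASE == &a[0] (plus any constant offset), OFF == the
     SSA_NAME holding b[i] (possibly after a widening conversion) and
     SCALE == 4.  */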
3161 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3162 &punsignedp, &reversep, &pvolatilep, false);
3163 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3165 if (TREE_CODE (base) == MEM_REF)
3167 if (!integer_zerop (TREE_OPERAND (base, 1)))
3169 if (off == NULL_TREE)
3171 offset_int moff = mem_ref_offset (base);
3172 off = wide_int_to_tree (sizetype, moff);
3174 else
3175 off = size_binop (PLUS_EXPR, off,
3176 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3178 base = TREE_OPERAND (base, 0);
3180 else
3181 base = build_fold_addr_expr (base);
3183 if (off == NULL_TREE)
3184 off = size_zero_node;
3186 /* If BASE is not loop invariant: when OFF is 0, we start with just
3187 the constant offset in the loop-invariant BASE and continue with base
3188 as OFF; otherwise give up.
3189 We could handle that case by gimplifying the addition of base + off
3190 into some SSA_NAME and use that as off, but for now punt. */
3191 if (!expr_invariant_in_loop_p (loop, base))
3193 if (!integer_zerop (off))
3194 return NULL_TREE;
3195 off = base;
3196 base = size_int (pbitpos / BITS_PER_UNIT);
3198 /* Otherwise put base + constant offset into the loop invariant BASE
3199 and continue with OFF. */
3200 else
3202 base = fold_convert (sizetype, base);
3203 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3206 /* OFF at this point may be either a SSA_NAME or some tree expression
3207 from get_inner_reference. Try to peel off loop invariants from it
3208 into BASE as long as possible. */
3209 STRIP_NOPS (off);
3210 while (offtype == NULL_TREE)
3212 enum tree_code code;
3213 tree op0, op1, add = NULL_TREE;
3215 if (TREE_CODE (off) == SSA_NAME)
3217 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3219 if (expr_invariant_in_loop_p (loop, off))
3220 return NULL_TREE;
3222 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3223 break;
3225 op0 = gimple_assign_rhs1 (def_stmt);
3226 code = gimple_assign_rhs_code (def_stmt);
3227 op1 = gimple_assign_rhs2 (def_stmt);
3229 else
3231 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3232 return NULL_TREE;
3233 code = TREE_CODE (off);
3234 extract_ops_from_tree (off, &code, &op0, &op1);
3236 switch (code)
3238 case POINTER_PLUS_EXPR:
3239 case PLUS_EXPR:
3240 if (expr_invariant_in_loop_p (loop, op0))
3242 add = op0;
3243 off = op1;
3244 do_add:
3245 add = fold_convert (sizetype, add);
3246 if (scale != 1)
3247 add = size_binop (MULT_EXPR, add, size_int (scale));
3248 base = size_binop (PLUS_EXPR, base, add);
3249 continue;
3251 if (expr_invariant_in_loop_p (loop, op1))
3253 add = op1;
3254 off = op0;
3255 goto do_add;
3257 break;
3258 case MINUS_EXPR:
3259 if (expr_invariant_in_loop_p (loop, op1))
3261 add = fold_convert (sizetype, op1);
3262 add = size_binop (MINUS_EXPR, size_zero_node, add);
3263 off = op0;
3264 goto do_add;
3266 break;
3267 case MULT_EXPR:
3268 if (scale == 1 && tree_fits_shwi_p (op1))
3270 scale = tree_to_shwi (op1);
3271 off = op0;
3272 continue;
3274 break;
3275 case SSA_NAME:
3276 off = op0;
3277 continue;
3278 CASE_CONVERT:
3279 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3280 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3281 break;
3282 if (TYPE_PRECISION (TREE_TYPE (op0))
3283 == TYPE_PRECISION (TREE_TYPE (off)))
3285 off = op0;
3286 continue;
3288 if (TYPE_PRECISION (TREE_TYPE (op0))
3289 < TYPE_PRECISION (TREE_TYPE (off)))
3291 off = op0;
3292 offtype = TREE_TYPE (off);
3293 STRIP_NOPS (off);
3294 continue;
3296 break;
3297 default:
3298 break;
3300 break;
3303 /* If at the end OFF still isn't a SSA_NAME or isn't
3304 defined in the loop, punt. */
3305 if (TREE_CODE (off) != SSA_NAME
3306 || expr_invariant_in_loop_p (loop, off))
3307 return NULL_TREE;
3309 if (offtype == NULL_TREE)
3310 offtype = TREE_TYPE (off);
3312 if (DR_IS_READ (dr))
3313 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3314 offtype, scale);
3315 else
3316 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3317 offtype, scale);
3319 if (decl == NULL_TREE)
3320 return NULL_TREE;
3322 if (basep)
3323 *basep = base;
3324 if (offp)
3325 *offp = off;
3326 if (scalep)
3327 *scalep = scale;
3328 return decl;
3331 /* Function vect_analyze_data_refs.
3333 Find all the data references in the loop or basic block.
3335 The general structure of the analysis of data refs in the vectorizer is as
3336 follows:
3337 1- vect_analyze_data_refs(loop/bb): call
3338 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3339 in the loop/bb and their dependences.
3340 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3341 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3342 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3346 bool
3347 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3349 struct loop *loop = NULL;
3350 unsigned int i;
3351 struct data_reference *dr;
3352 tree scalar_type;
3354 if (dump_enabled_p ())
3355 dump_printf_loc (MSG_NOTE, vect_location,
3356 "=== vect_analyze_data_refs ===\n");
3358 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3359 loop = LOOP_VINFO_LOOP (loop_vinfo);
3361 /* Go through the data-refs, check that the analysis succeeded. Update
3362 pointer from stmt_vec_info struct to DR and vectype. */
3364 vec<data_reference_p> datarefs = vinfo->datarefs;
3365 FOR_EACH_VEC_ELT (datarefs, i, dr)
3367 gimple *stmt;
3368 stmt_vec_info stmt_info;
3369 tree base, offset, init;
3370 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3371 bool simd_lane_access = false;
3372 int vf;
3374 again:
3375 if (!dr || !DR_REF (dr))
3377 if (dump_enabled_p ())
3378 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3379 "not vectorized: unhandled data-ref\n");
3380 return false;
3383 stmt = DR_STMT (dr);
3384 stmt_info = vinfo_for_stmt (stmt);
3386 /* Discard clobbers from the dataref vector. We will remove
3387 clobber stmts during vectorization. */
3388 if (gimple_clobber_p (stmt))
3390 free_data_ref (dr);
3391 if (i == datarefs.length () - 1)
3393 datarefs.pop ();
3394 break;
3396 datarefs.ordered_remove (i);
3397 dr = datarefs[i];
3398 goto again;
3401 /* Check that analysis of the data-ref succeeded. */
3402 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3403 || !DR_STEP (dr))
3405 bool maybe_gather
3406 = DR_IS_READ (dr)
3407 && !TREE_THIS_VOLATILE (DR_REF (dr))
3408 && targetm.vectorize.builtin_gather != NULL;
3409 bool maybe_scatter
3410 = DR_IS_WRITE (dr)
3411 && !TREE_THIS_VOLATILE (DR_REF (dr))
3412 && targetm.vectorize.builtin_scatter != NULL;
3413 bool maybe_simd_lane_access
3414 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3416 /* If target supports vector gather loads or scatter stores, or if
3417 this might be a SIMD lane access, see whether they can be used. */
3418 if (is_a <loop_vec_info> (vinfo)
3419 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3420 && !nested_in_vect_loop_p (loop, stmt))
3422 struct data_reference *newdr
3423 = create_data_ref (NULL, loop_containing_stmt (stmt),
3424 DR_REF (dr), stmt, maybe_scatter ? false : true);
3425 gcc_assert (newdr != NULL && DR_REF (newdr));
3426 if (DR_BASE_ADDRESS (newdr)
3427 && DR_OFFSET (newdr)
3428 && DR_INIT (newdr)
3429 && DR_STEP (newdr)
3430 && integer_zerop (DR_STEP (newdr)))
3432 if (maybe_simd_lane_access)
3434 tree off = DR_OFFSET (newdr);
3435 STRIP_NOPS (off);
3436 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3437 && TREE_CODE (off) == MULT_EXPR
3438 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3440 tree step = TREE_OPERAND (off, 1);
3441 off = TREE_OPERAND (off, 0);
3442 STRIP_NOPS (off);
3443 if (CONVERT_EXPR_P (off)
3444 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3445 0)))
3446 < TYPE_PRECISION (TREE_TYPE (off)))
3447 off = TREE_OPERAND (off, 0);
3448 if (TREE_CODE (off) == SSA_NAME)
3450 gimple *def = SSA_NAME_DEF_STMT (off);
3451 tree reft = TREE_TYPE (DR_REF (newdr));
3452 if (is_gimple_call (def)
3453 && gimple_call_internal_p (def)
3454 && (gimple_call_internal_fn (def)
3455 == IFN_GOMP_SIMD_LANE))
3457 tree arg = gimple_call_arg (def, 0);
3458 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3459 arg = SSA_NAME_VAR (arg);
3460 if (arg == loop->simduid
3461 /* For now. */
3462 && tree_int_cst_equal
3463 (TYPE_SIZE_UNIT (reft),
3464 step))
3466 DR_OFFSET (newdr) = ssize_int (0);
3467 DR_STEP (newdr) = step;
3468 DR_ALIGNED_TO (newdr)
3469 = size_int (BIGGEST_ALIGNMENT);
3470 dr = newdr;
3471 simd_lane_access = true;
3477 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3479 dr = newdr;
3480 if (maybe_gather)
3481 gatherscatter = GATHER;
3482 else
3483 gatherscatter = SCATTER;
3486 if (gatherscatter == SG_NONE && !simd_lane_access)
3487 free_data_ref (newdr);
3490 if (gatherscatter == SG_NONE && !simd_lane_access)
3492 if (dump_enabled_p ())
3494 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3495 "not vectorized: data ref analysis "
3496 "failed ");
3497 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3498 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3501 if (is_a <bb_vec_info> (vinfo))
3502 break;
3504 return false;
3508 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3510 if (dump_enabled_p ())
3511 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3512 "not vectorized: base addr of dr is a "
3513 "constant\n");
3515 if (is_a <bb_vec_info> (vinfo))
3516 break;
3518 if (gatherscatter != SG_NONE || simd_lane_access)
3519 free_data_ref (dr);
3520 return false;
3523 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3525 if (dump_enabled_p ())
3527 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3528 "not vectorized: volatile type ");
3529 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3530 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3533 if (is_a <bb_vec_info> (vinfo))
3534 break;
3536 return false;
3539 if (stmt_can_throw_internal (stmt))
3541 if (dump_enabled_p ())
3543 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3544 "not vectorized: statement can throw an "
3545 "exception ");
3546 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3547 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3550 if (is_a <bb_vec_info> (vinfo))
3551 break;
3553 if (gatherscatter != SG_NONE || simd_lane_access)
3554 free_data_ref (dr);
3555 return false;
3558 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3559 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3561 if (dump_enabled_p ())
3563 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3564 "not vectorized: statement is bitfield "
3565 "access ");
3566 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3567 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3570 if (is_a <bb_vec_info> (vinfo))
3571 break;
3573 if (gatherscatter != SG_NONE || simd_lane_access)
3574 free_data_ref (dr);
3575 return false;
3578 base = unshare_expr (DR_BASE_ADDRESS (dr));
3579 offset = unshare_expr (DR_OFFSET (dr));
3580 init = unshare_expr (DR_INIT (dr));
3582 if (is_gimple_call (stmt)
3583 && (!gimple_call_internal_p (stmt)
3584 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3585 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3587 if (dump_enabled_p ())
3589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3590 "not vectorized: dr in a call ");
3591 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3592 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3595 if (is_a <bb_vec_info> (vinfo))
3596 break;
3598 if (gatherscatter != SG_NONE || simd_lane_access)
3599 free_data_ref (dr);
3600 return false;
3603 /* Update DR field in stmt_vec_info struct. */
3605 /* If the dataref is in an inner-loop of the loop that is considered for
3606 vectorization, we also want to analyze the access relative to
3607 the outer-loop (DR contains information only relative to the
3608 inner-most enclosing loop). We do that by building a reference to the
3609 first location accessed by the inner-loop, and analyze it relative to
3610 the outer-loop. */
3611 if (loop && nested_in_vect_loop_p (loop, stmt))
3613 tree outer_step, outer_base, outer_init;
3614 HOST_WIDE_INT pbitsize, pbitpos;
3615 tree poffset;
3616 machine_mode pmode;
3617 int punsignedp, preversep, pvolatilep;
3618 affine_iv base_iv, offset_iv;
3619 tree dinit;
3621 /* Build a reference to the first location accessed by the
3622 inner-loop: *(BASE+INIT). (The first location is actually
3623 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3624 tree inner_base = build_fold_indirect_ref
3625 (fold_build_pointer_plus (base, init));
3627 if (dump_enabled_p ())
3629 dump_printf_loc (MSG_NOTE, vect_location,
3630 "analyze in outer-loop: ");
3631 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3632 dump_printf (MSG_NOTE, "\n");
3635 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3636 &poffset, &pmode, &punsignedp,
3637 &preversep, &pvolatilep, false);
3638 gcc_assert (outer_base != NULL_TREE);
3640 if (pbitpos % BITS_PER_UNIT != 0)
3642 if (dump_enabled_p ())
3643 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3644 "failed: bit offset alignment.\n");
3645 return false;
3648 if (preversep)
3650 if (dump_enabled_p ())
3651 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3652 "failed: reverse storage order.\n");
3653 return false;
3656 outer_base = build_fold_addr_expr (outer_base);
3657 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3658 &base_iv, false))
3660 if (dump_enabled_p ())
3661 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3662 "failed: evolution of base is not affine.\n");
3663 return false;
3666 if (offset)
3668 if (poffset)
3669 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3670 poffset);
3671 else
3672 poffset = offset;
3675 if (!poffset)
3677 offset_iv.base = ssize_int (0);
3678 offset_iv.step = ssize_int (0);
3680 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3681 &offset_iv, false))
3683 if (dump_enabled_p ())
3684 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3685 "evolution of offset is not affine.\n");
3686 return false;
3689 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3690 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3691 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3692 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3693 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3695 outer_step = size_binop (PLUS_EXPR,
3696 fold_convert (ssizetype, base_iv.step),
3697 fold_convert (ssizetype, offset_iv.step));
3699 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3700 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3701 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3702 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3703 STMT_VINFO_DR_OFFSET (stmt_info) =
3704 fold_convert (ssizetype, offset_iv.base);
3705 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3706 size_int (highest_pow2_factor (offset_iv.base));
3708 if (dump_enabled_p ())
3710 dump_printf_loc (MSG_NOTE, vect_location,
3711 "\touter base_address: ");
3712 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3713 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3714 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3715 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3716 STMT_VINFO_DR_OFFSET (stmt_info));
3717 dump_printf (MSG_NOTE,
3718 "\n\touter constant offset from base address: ");
3719 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3720 STMT_VINFO_DR_INIT (stmt_info));
3721 dump_printf (MSG_NOTE, "\n\touter step: ");
3722 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3723 STMT_VINFO_DR_STEP (stmt_info));
3724 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3725 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3726 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3727 dump_printf (MSG_NOTE, "\n");
3731 if (STMT_VINFO_DATA_REF (stmt_info))
3733 if (dump_enabled_p ())
3735 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3736 "not vectorized: more than one data ref "
3737 "in stmt: ");
3738 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3739 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3742 if (is_a <bb_vec_info> (vinfo))
3743 break;
3745 if (gatherscatter != SG_NONE || simd_lane_access)
3746 free_data_ref (dr);
3747 return false;
3750 STMT_VINFO_DATA_REF (stmt_info) = dr;
3751 if (simd_lane_access)
3753 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3754 free_data_ref (datarefs[i]);
3755 datarefs[i] = dr;
3758 /* Set vectype for STMT. */
3759 scalar_type = TREE_TYPE (DR_REF (dr));
3760 STMT_VINFO_VECTYPE (stmt_info)
3761 = get_vectype_for_scalar_type (scalar_type);
3762 if (!STMT_VINFO_VECTYPE (stmt_info))
3764 if (dump_enabled_p ())
3766 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3767 "not vectorized: no vectype for stmt: ");
3768 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3769 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3770 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3771 scalar_type);
3772 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3775 if (is_a <bb_vec_info> (vinfo))
3777 /* Having no vector type is fine; the ref can still participate
3778 in dependence analysis, we just can't vectorize it. */
3779 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3780 continue;
3783 if (gatherscatter != SG_NONE || simd_lane_access)
3785 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3786 if (gatherscatter != SG_NONE)
3787 free_data_ref (dr);
3789 return false;
3791 else
3793 if (dump_enabled_p ())
3795 dump_printf_loc (MSG_NOTE, vect_location,
3796 "got vectype for stmt: ");
3797 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3798 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3799 STMT_VINFO_VECTYPE (stmt_info));
3800 dump_printf (MSG_NOTE, "\n");
3804 /* Adjust the minimal vectorization factor according to the
3805 vector type. */
3806 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3807 if (vf > *min_vf)
3808 *min_vf = vf;
3810 if (gatherscatter != SG_NONE)
3812 tree off;
3813 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3814 NULL, &off, NULL)
3815 || get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3817 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3818 free_data_ref (dr);
3819 if (dump_enabled_p ())
3821 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3822 (gatherscatter == GATHER) ?
3823 "not vectorized: not suitable for gather "
3824 "load " :
3825 "not vectorized: not suitable for scatter "
3826 "store ");
3827 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3828 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3830 return false;
3833 datarefs[i] = dr;
3834 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3837 else if (is_a <loop_vec_info> (vinfo)
3838 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3840 if (nested_in_vect_loop_p (loop, stmt))
3842 if (dump_enabled_p ())
3844 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3845 "not vectorized: not suitable for strided "
3846 "load ");
3847 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3848 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3850 return false;
3852 STMT_VINFO_STRIDED_P (stmt_info) = true;
3856 /* If we stopped analysis at the first dataref we could not analyze
3857 when trying to vectorize a basic-block, mark the rest of the datarefs
3858 as not vectorizable and truncate the vector of datarefs. That
3859 avoids spending useless time analyzing their dependence. */
3860 if (i != datarefs.length ())
3862 gcc_assert (is_a <bb_vec_info> (vinfo));
3863 for (unsigned j = i; j < datarefs.length (); ++j)
3865 data_reference_p dr = datarefs[j];
3866 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3867 free_data_ref (dr);
3869 datarefs.truncate (i);
3872 return true;
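/* Illustrative sketch, not part of this file: two scalar loops whose data
   references differ in the kind of DR_STEP they produce.  The function and
   parameter names are made up for the example.  The first access has a
   compile-time INTEGER_CST step; the second has a step that depends on a
   run-time value, which is the case handled above by setting
   STMT_VINFO_STRIDED_P (or rejected when nested in an outer loop).  */
#if 0
void
constant_step (int *a, int n)
{
  for (int i = 0; i < n; i++)
    a[i] = 0;			/* DR_STEP is the INTEGER_CST sizeof (int).  */
}

void
runtime_step (int *a, int n, int stride)
{
  for (int i = 0; i < n; i++)
    a[i * stride] = 0;		/* DR_STEP depends on STRIDE, not an INTEGER_CST.  */
}
#endif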
3876 /* Function vect_get_new_vect_var.
3878 Returns a name for a new variable. The current naming scheme chooses a
3879 prefix such as "vect", "stmp", "mask" or "vectp" (depending on the value
3880 of VAR_KIND) for vectorizer-generated variables, and prepends it, with an
3881 underscore, to NAME if provided. */
3883 tree
3884 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3886 const char *prefix;
3887 tree new_vect_var;
3889 switch (var_kind)
3891 case vect_simple_var:
3892 prefix = "vect";
3893 break;
3894 case vect_scalar_var:
3895 prefix = "stmp";
3896 break;
3897 case vect_mask_var:
3898 prefix = "mask";
3899 break;
3900 case vect_pointer_var:
3901 prefix = "vectp";
3902 break;
3903 default:
3904 gcc_unreachable ();
3907 if (name)
3909 char* tmp = concat (prefix, "_", name, NULL);
3910 new_vect_var = create_tmp_reg (type, tmp);
3911 free (tmp);
3913 else
3914 new_vect_var = create_tmp_reg (type, prefix);
3916 return new_vect_var;
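/* Illustrative sketch, not part of this file: the naming scheme above,
   modelled with plain libc calls instead of concat/create_tmp_reg.  The
   function name and buffer size are made up for the example.  */
#if 0
#include <stdio.h>

static void
show_vect_var_name (const char *prefix, const char *name)
{
  /* Mimics concat (prefix, "_", name, NULL): for ("vectp", "foo") this
     prints "vectp_foo", the name the new temporary would get.  */
  char buf[64];
  snprintf (buf, sizeof buf, "%s_%s", prefix, name);
  printf ("%s\n", buf);
}
#endif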
3919 /* Like vect_get_new_vect_var but return an SSA name. */
3921 tree
3922 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3924 const char *prefix;
3925 tree new_vect_var;
3927 switch (var_kind)
3929 case vect_simple_var:
3930 prefix = "vect";
3931 break;
3932 case vect_scalar_var:
3933 prefix = "stmp";
3934 break;
3935 case vect_pointer_var:
3936 prefix = "vectp";
3937 break;
3938 default:
3939 gcc_unreachable ();
3942 if (name)
3944 char* tmp = concat (prefix, "_", name, NULL);
3945 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
3946 free (tmp);
3948 else
3949 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
3951 return new_vect_var;
3954 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
3956 static void
3957 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3958 stmt_vec_info stmt_info)
3960 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3961 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3962 int misalign = DR_MISALIGNMENT (dr);
3963 if (misalign == -1)
3964 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3965 else
3966 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
3969 /* Function vect_create_addr_base_for_vector_ref.
3971 Create an expression that computes the address of the first memory location
3972 that will be accessed for a data reference.
3974 Input:
3975 STMT: The statement containing the data reference.
3976 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3977 OFFSET: Optional. If supplied, it is added to the initial address.
3978 LOOP: Specify the loop-nest relative to which the address should be computed.
3979 For example, when the dataref is in an inner-loop nested in an
3980 outer-loop that is now being vectorized, LOOP can be either the
3981 outer-loop, or the inner-loop. The first memory location accessed
3982 by the following dataref ('in' points to short):
3984 for (i=0; i<N; i++)
3985 for (j=0; j<M; j++)
3986 s += in[i+j]
3988 is as follows:
3989 if LOOP=i_loop: &in (relative to i_loop)
3990 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3991 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
3992 initial address. Unlike OFFSET, which is the number of elements to
3993 be added, BYTE_OFFSET is measured in bytes.
3995 Output:
3996 1. Return an SSA_NAME whose value is the address of the memory location of
3997 the first vector of the data reference.
3998 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3999 these statement(s) which define the returned SSA_NAME.
4001 FORNOW: We are only handling array accesses with step 1. */
4003 tree
4004 vect_create_addr_base_for_vector_ref (gimple *stmt,
4005 gimple_seq *new_stmt_list,
4006 tree offset,
4007 struct loop *loop,
4008 tree byte_offset)
4010 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4011 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4012 tree data_ref_base;
4013 const char *base_name;
4014 tree addr_base;
4015 tree dest;
4016 gimple_seq seq = NULL;
4017 tree base_offset;
4018 tree init;
4019 tree vect_ptr_type;
4020 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4021 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4023 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
4025 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
4027 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
4029 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4030 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
4031 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
4033 else
4035 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
4036 base_offset = unshare_expr (DR_OFFSET (dr));
4037 init = unshare_expr (DR_INIT (dr));
4040 if (loop_vinfo)
4041 base_name = get_name (data_ref_base);
4042 else
4044 base_offset = ssize_int (0);
4045 init = ssize_int (0);
4046 base_name = get_name (DR_REF (dr));
4049 /* Create base_offset. */
4050 base_offset = size_binop (PLUS_EXPR,
4051 fold_convert (sizetype, base_offset),
4052 fold_convert (sizetype, init));
4054 if (offset)
4056 offset = fold_build2 (MULT_EXPR, sizetype,
4057 fold_convert (sizetype, offset), step);
4058 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4059 base_offset, offset);
4061 if (byte_offset)
4063 byte_offset = fold_convert (sizetype, byte_offset);
4064 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4065 base_offset, byte_offset);
4068 /* base + base_offset */
4069 if (loop_vinfo)
4070 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4071 else
4073 addr_base = build1 (ADDR_EXPR,
4074 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4075 unshare_expr (DR_REF (dr)));
4078 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4079 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4080 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4081 gimple_seq_add_seq (new_stmt_list, seq);
4083 if (DR_PTR_INFO (dr)
4084 && TREE_CODE (addr_base) == SSA_NAME
4085 && !SSA_NAME_PTR_INFO (addr_base))
4087 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4088 if (offset || byte_offset)
4089 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4092 if (dump_enabled_p ())
4094 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4095 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4096 dump_printf (MSG_NOTE, "\n");
4099 return addr_base;
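/* Illustrative sketch, not part of this file: the address arithmetic
   performed above, written as plain C.  The function and parameter names
   are made up; OFFSET is counted in elements (hence the multiplication by
   STEP, the element size in bytes) while BYTE_OFFSET is already in bytes,
   matching the size_binop/fold_build2 chain in the function.  */
#if 0
#include <stddef.h>

static char *
addr_base_sketch (char *base, size_t dr_offset, size_t init,
		  size_t offset, size_t step, size_t byte_offset)
{
  /* DR_OFFSET and DR_INIT are added first, then OFFSET scaled by the
     element size, then BYTE_OFFSET.  */
  size_t base_offset = dr_offset + init + offset * step + byte_offset;
  return base + base_offset;	/* Corresponds to fold_build_pointer_plus.  */
}
#endif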
4103 /* Function vect_create_data_ref_ptr.
4105 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4106 location accessed in the loop by STMT, along with the def-use update
4107 chain to appropriately advance the pointer through the loop iterations.
4108 Also set aliasing information for the pointer. This pointer is used by
4109 the callers to this function to create a memory reference expression for
4110 vector load/store access.
4112 Input:
4113 1. STMT: a stmt that references memory. Expected to be of the form
4114 GIMPLE_ASSIGN <name, data-ref> or
4115 GIMPLE_ASSIGN <data-ref, name>.
4116 2. AGGR_TYPE: the type of the reference, which should be either a vector
4117 or an array.
4118 3. AT_LOOP: the loop where the vector memref is to be created.
4119 4. OFFSET (optional): an offset to be added to the initial address accessed
4120 by the data-ref in STMT.
4121 5. BSI: location where the new stmts are to be placed if there is no loop
4122 6. ONLY_INIT: indicate whether ap is to be updated in the loop, or remain
4123 pointing to the initial address.
4124 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4125 to the initial address accessed by the data-ref in STMT. This is
4126 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4127 in bytes.
4129 Output:
4130 1. Declare a new ptr to vector_type, and have it point to the base of the
4131 data reference (initial address accessed by the data reference).
4132 For example, for vector of type V8HI, the following code is generated:
4134 v8hi *ap;
4135 ap = (v8hi *)initial_address;
4137 if OFFSET is not supplied:
4138 initial_address = &a[init];
4139 if OFFSET is supplied:
4140 initial_address = &a[init + OFFSET];
4141 if BYTE_OFFSET is supplied:
4142 initial_address = &a[init] + BYTE_OFFSET;
4144 Return the initial_address in INITIAL_ADDRESS.
4146 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4147 update the pointer in each iteration of the loop.
4149 Return the increment stmt that updates the pointer in PTR_INCR.
4151 3. Set INV_P to true if the access pattern of the data reference in the
4152 vectorized loop is invariant. Set it to false otherwise.
4154 4. Return the pointer. */
4156 tree
4157 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4158 tree offset, tree *initial_address,
4159 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4160 bool only_init, bool *inv_p, tree byte_offset)
4162 const char *base_name;
4163 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4164 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4165 struct loop *loop = NULL;
4166 bool nested_in_vect_loop = false;
4167 struct loop *containing_loop = NULL;
4168 tree aggr_ptr_type;
4169 tree aggr_ptr;
4170 tree new_temp;
4171 gimple_seq new_stmt_list = NULL;
4172 edge pe = NULL;
4173 basic_block new_bb;
4174 tree aggr_ptr_init;
4175 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4176 tree aptr;
4177 gimple_stmt_iterator incr_gsi;
4178 bool insert_after;
4179 tree indx_before_incr, indx_after_incr;
4180 gimple *incr;
4181 tree step;
4182 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4184 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4185 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4187 if (loop_vinfo)
4189 loop = LOOP_VINFO_LOOP (loop_vinfo);
4190 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4191 containing_loop = (gimple_bb (stmt))->loop_father;
4192 pe = loop_preheader_edge (loop);
4194 else
4196 gcc_assert (bb_vinfo);
4197 only_init = true;
4198 *ptr_incr = NULL;
4201 /* Check the step (evolution) of the load in LOOP, and record
4202 whether it's invariant. */
4203 if (nested_in_vect_loop)
4204 step = STMT_VINFO_DR_STEP (stmt_info);
4205 else
4206 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4208 if (integer_zerop (step))
4209 *inv_p = true;
4210 else
4211 *inv_p = false;
4213 /* Create an expression for the first address accessed by this load
4214 in LOOP. */
4215 base_name = get_name (DR_BASE_ADDRESS (dr));
4217 if (dump_enabled_p ())
4219 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4220 dump_printf_loc (MSG_NOTE, vect_location,
4221 "create %s-pointer variable to type: ",
4222 get_tree_code_name (TREE_CODE (aggr_type)));
4223 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4224 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4225 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4226 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4227 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4228 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4229 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4230 else
4231 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4232 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4233 dump_printf (MSG_NOTE, "\n");
4236 /* (1) Create the new aggregate-pointer variable.
4237 Vector and array types inherit the alias set of their component
4238 type by default so we need to use a ref-all pointer if the data
4239 reference does not conflict with the created aggregated data
4240 reference because it is not addressable. */
4241 bool need_ref_all = false;
4242 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4243 get_alias_set (DR_REF (dr))))
4244 need_ref_all = true;
4245 /* Likewise for any of the data references in the stmt group. */
4246 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4248 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4251 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4252 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4253 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4254 get_alias_set (DR_REF (sdr))))
4256 need_ref_all = true;
4257 break;
4259 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4261 while (orig_stmt);
4263 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4264 need_ref_all);
4265 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4268 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4269 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4270 def-use update cycles for the pointer: one relative to the outer-loop
4271 (LOOP), which is what steps (3) and (4) below do. The other is relative
4272 to the inner-loop (which is the inner-most loop containing the dataref),
4273 and this is done by step (5) below.
4275 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4276 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4277 redundant. Steps (3),(4) create the following:
4279 vp0 = &base_addr;
4280 LOOP: vp1 = phi(vp0,vp2)
4283 vp2 = vp1 + step
4284 goto LOOP
4286 If there is an inner-loop nested in loop, then step (5) will also be
4287 applied, and an additional update in the inner-loop will be created:
4289 vp0 = &base_addr;
4290 LOOP: vp1 = phi(vp0,vp2)
4292 inner: vp3 = phi(vp1,vp4)
4293 vp4 = vp3 + inner_step
4294 if () goto inner
4296 vp2 = vp1 + step
4297 if () goto LOOP */
4299 /* (2) Calculate the initial address of the aggregate-pointer, and set
4300 the aggregate-pointer to point to it before the loop. */
4302 /* Create &(base[init_val + offset]) + byte_offset in the loop preheader. */
4304 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4305 offset, loop, byte_offset);
4306 if (new_stmt_list)
4308 if (pe)
4310 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4311 gcc_assert (!new_bb);
4313 else
4314 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4317 *initial_address = new_temp;
4318 aggr_ptr_init = new_temp;
4320 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4321 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4322 inner-loop nested in LOOP (during outer-loop vectorization). */
4324 /* No update in loop is required. */
4325 if (only_init && (!loop_vinfo || at_loop == loop))
4326 aptr = aggr_ptr_init;
4327 else
4329 /* The step of the aggregate pointer is the type size. */
4330 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4331 /* One exception to the above is when the scalar step of the load in
4332 LOOP is zero. In this case the step here is also zero. */
4333 if (*inv_p)
4334 iv_step = size_zero_node;
4335 else if (tree_int_cst_sgn (step) == -1)
4336 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4338 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4340 create_iv (aggr_ptr_init,
4341 fold_convert (aggr_ptr_type, iv_step),
4342 aggr_ptr, loop, &incr_gsi, insert_after,
4343 &indx_before_incr, &indx_after_incr);
4344 incr = gsi_stmt (incr_gsi);
4345 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4347 /* Copy the points-to information if it exists. */
4348 if (DR_PTR_INFO (dr))
4350 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4351 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4353 if (ptr_incr)
4354 *ptr_incr = incr;
4356 aptr = indx_before_incr;
4359 if (!nested_in_vect_loop || only_init)
4360 return aptr;
4363 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4364 nested in LOOP, if it exists. */
4366 gcc_assert (nested_in_vect_loop);
4367 if (!only_init)
4369 standard_iv_increment_position (containing_loop, &incr_gsi,
4370 &insert_after);
4371 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4372 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4373 &indx_after_incr);
4374 incr = gsi_stmt (incr_gsi);
4375 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4377 /* Copy the points-to information if it exists. */
4378 if (DR_PTR_INFO (dr))
4380 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4381 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4383 if (ptr_incr)
4384 *ptr_incr = incr;
4386 return indx_before_incr;
4388 else
4389 gcc_unreachable ();
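/* Illustrative sketch, not part of this file: the def-use update cycle
   created by steps (2)-(4) above, written as a plain C loop.  The function
   and parameter names are made up; AGGR_SIZE plays the role of
   TYPE_SIZE_UNIT (aggr_type), the iv_step passed to create_iv.  */
#if 0
#include <stddef.h>

static void
data_ref_ptr_sketch (char *base_addr, size_t aggr_size, size_t niters)
{
  char *vp = base_addr;		/* vp0 = &base_addr, set in the preheader.  */
  for (size_t i = 0; i < niters; i++)
    {
      /* ... vectorized loads/stores through VP happen here ...  */
      vp += aggr_size;		/* vp2 = vp1 + step, added at the latch.  */
    }
}
#endif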
4393 /* Function bump_vector_ptr
4395 Increment a pointer (to a vector type) by vector-size. If requested,
4396 i.e. if PTR_INCR is given, then also connect the new increment stmt
4397 to the existing def-use update-chain of the pointer, by modifying
4398 the PTR_INCR as illustrated below:
4400 The pointer def-use update-chain before this function:
4401 DATAREF_PTR = phi (p_0, p_2)
4402 ....
4403 PTR_INCR: p_2 = DATAREF_PTR + step
4405 The pointer def-use update-chain after this function:
4406 DATAREF_PTR = phi (p_0, p_2)
4407 ....
4408 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4409 ....
4410 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4412 Input:
4413 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4414 in the loop.
4415 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4416 the loop. The increment amount across iterations is expected
4417 to be vector_size.
4418 BSI - location where the new update stmt is to be placed.
4419 STMT - the original scalar memory-access stmt that is being vectorized.
4420 BUMP - optional. The offset by which to bump the pointer. If not given,
4421 the offset is assumed to be vector_size.
4423 Output: Return NEW_DATAREF_PTR as illustrated above.
4427 tree
4428 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4429 gimple *stmt, tree bump)
4431 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4432 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4433 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4434 tree update = TYPE_SIZE_UNIT (vectype);
4435 gassign *incr_stmt;
4436 ssa_op_iter iter;
4437 use_operand_p use_p;
4438 tree new_dataref_ptr;
4440 if (bump)
4441 update = bump;
4443 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4444 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4445 else
4446 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4447 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4448 dataref_ptr, update);
4449 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4451 /* Copy the points-to information if it exists. */
4452 if (DR_PTR_INFO (dr))
4454 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4455 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4458 if (!ptr_incr)
4459 return new_dataref_ptr;
4461 /* Update the vector-pointer's cross-iteration increment. */
4462 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4464 tree use = USE_FROM_PTR (use_p);
4466 if (use == dataref_ptr)
4467 SET_USE (use_p, new_dataref_ptr);
4468 else
4469 gcc_assert (tree_int_cst_compare (use, update) == 0);
4472 return new_dataref_ptr;
4476 /* Function vect_create_destination_var.
4478 Create a new temporary of type VECTYPE. */
4480 tree
4481 vect_create_destination_var (tree scalar_dest, tree vectype)
4483 tree vec_dest;
4484 const char *name;
4485 char *new_name;
4486 tree type;
4487 enum vect_var_kind kind;
4489 kind = vectype
4490 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4491 ? vect_mask_var
4492 : vect_simple_var
4493 : vect_scalar_var;
4494 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4496 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4498 name = get_name (scalar_dest);
4499 if (name)
4500 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4501 else
4502 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4503 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4504 free (new_name);
4506 return vec_dest;
4509 /* Function vect_grouped_store_supported.
4511 Returns TRUE if interleave high and interleave low permutations
4512 are supported, and FALSE otherwise. */
4514 bool
4515 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4517 machine_mode mode = TYPE_MODE (vectype);
4519 /* vect_permute_store_chain requires the group size to be equal to 3 or
4520 be a power of two. */
4521 if (count != 3 && exact_log2 (count) == -1)
4523 if (dump_enabled_p ())
4524 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4525 "the size of the group of accesses"
4526 " is not a power of 2 or not eqaul to 3\n");
4527 return false;
4530 /* Check that the permutation is supported. */
4531 if (VECTOR_MODE_P (mode))
4533 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4534 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4536 if (count == 3)
4538 unsigned int j0 = 0, j1 = 0, j2 = 0;
4539 unsigned int i, j;
4541 for (j = 0; j < 3; j++)
4543 int nelt0 = ((3 - j) * nelt) % 3;
4544 int nelt1 = ((3 - j) * nelt + 1) % 3;
4545 int nelt2 = ((3 - j) * nelt + 2) % 3;
4546 for (i = 0; i < nelt; i++)
4548 if (3 * i + nelt0 < nelt)
4549 sel[3 * i + nelt0] = j0++;
4550 if (3 * i + nelt1 < nelt)
4551 sel[3 * i + nelt1] = nelt + j1++;
4552 if (3 * i + nelt2 < nelt)
4553 sel[3 * i + nelt2] = 0;
4555 if (!can_vec_perm_p (mode, false, sel))
4557 if (dump_enabled_p ())
4558 dump_printf (MSG_MISSED_OPTIMIZATION,
4559 "permutaion op not supported by target.\n");
4560 return false;
4563 for (i = 0; i < nelt; i++)
4565 if (3 * i + nelt0 < nelt)
4566 sel[3 * i + nelt0] = 3 * i + nelt0;
4567 if (3 * i + nelt1 < nelt)
4568 sel[3 * i + nelt1] = 3 * i + nelt1;
4569 if (3 * i + nelt2 < nelt)
4570 sel[3 * i + nelt2] = nelt + j2++;
4572 if (!can_vec_perm_p (mode, false, sel))
4574 if (dump_enabled_p ())
4575 dump_printf (MSG_MISSED_OPTIMIZATION,
4576 "permutaion op not supported by target.\n");
4577 return false;
4580 return true;
4582 else
4584 /* If length is not equal to 3 then only a power of 2 is supported. */
4585 gcc_assert (exact_log2 (count) != -1);
4587 for (i = 0; i < nelt / 2; i++)
4589 sel[i * 2] = i;
4590 sel[i * 2 + 1] = i + nelt;
4592 if (can_vec_perm_p (mode, false, sel))
4594 for (i = 0; i < nelt; i++)
4595 sel[i] += nelt / 2;
4596 if (can_vec_perm_p (mode, false, sel))
4597 return true;
4602 if (dump_enabled_p ())
4603 dump_printf (MSG_MISSED_OPTIMIZATION,
4604 "permutaion op not supported by target.\n");
4605 return false;
4609 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4610 type VECTYPE. */
4612 bool
4613 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4615 return vect_lanes_optab_supported_p ("vec_store_lanes",
4616 vec_store_lanes_optab,
4617 vectype, count);
4621 /* Function vect_permute_store_chain.
4623 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4624 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4625 the data correctly for the stores. Return the final references for stores
4626 in RESULT_CHAIN.
4628 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4629 The input is 4 vectors each containing 8 elements. We assign a number to
4630 each element, the input sequence is:
4632 1st vec: 0 1 2 3 4 5 6 7
4633 2nd vec: 8 9 10 11 12 13 14 15
4634 3rd vec: 16 17 18 19 20 21 22 23
4635 4th vec: 24 25 26 27 28 29 30 31
4637 The output sequence should be:
4639 1st vec: 0 8 16 24 1 9 17 25
4640 2nd vec: 2 10 18 26 3 11 19 27
4641 3rd vec: 4 12 20 28 5 13 21 29
4642 4th vec: 6 14 22 30 7 15 23 31
4644 i.e., we interleave the contents of the four vectors in their order.
4646 We use interleave_high/low instructions to create such output. The input of
4647 each interleave_high/low operation is two vectors:
4648 1st vec 2nd vec
4649 0 1 2 3 4 5 6 7
4650 the even elements of the result vector are obtained left-to-right from the
4651 high/low elements of the first vector. The odd elements of the result are
4652 obtained left-to-right from the high/low elements of the second vector.
4653 The output of interleave_high will be: 0 4 1 5
4654 and of interleave_low: 2 6 3 7
4657 The permutation is done in log LENGTH stages. In each stage interleave_high
4658 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4659 where the first argument is taken from the first half of DR_CHAIN and the
4660 second argument from its second half.
4661 In our example,
4663 I1: interleave_high (1st vec, 3rd vec)
4664 I2: interleave_low (1st vec, 3rd vec)
4665 I3: interleave_high (2nd vec, 4th vec)
4666 I4: interleave_low (2nd vec, 4th vec)
4668 The output for the first stage is:
4670 I1: 0 16 1 17 2 18 3 19
4671 I2: 4 20 5 21 6 22 7 23
4672 I3: 8 24 9 25 10 26 11 27
4673 I4: 12 28 13 29 14 30 15 31
4675 The output of the second stage, i.e. the final result is:
4677 I1: 0 8 16 24 1 9 17 25
4678 I2: 2 10 18 26 3 11 19 27
4679 I3: 4 12 20 28 5 13 21 29
4680 I4: 6 14 22 30 7 15 23 31. */
4682 void
4683 vect_permute_store_chain (vec<tree> dr_chain,
4684 unsigned int length,
4685 gimple *stmt,
4686 gimple_stmt_iterator *gsi,
4687 vec<tree> *result_chain)
4689 tree vect1, vect2, high, low;
4690 gimple *perm_stmt;
4691 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4692 tree perm_mask_low, perm_mask_high;
4693 tree data_ref;
4694 tree perm3_mask_low, perm3_mask_high;
4695 unsigned int i, n, log_length = exact_log2 (length);
4696 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4697 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4699 result_chain->quick_grow (length);
4700 memcpy (result_chain->address (), dr_chain.address (),
4701 length * sizeof (tree));
4703 if (length == 3)
4705 unsigned int j0 = 0, j1 = 0, j2 = 0;
4707 for (j = 0; j < 3; j++)
4709 int nelt0 = ((3 - j) * nelt) % 3;
4710 int nelt1 = ((3 - j) * nelt + 1) % 3;
4711 int nelt2 = ((3 - j) * nelt + 2) % 3;
4713 for (i = 0; i < nelt; i++)
4715 if (3 * i + nelt0 < nelt)
4716 sel[3 * i + nelt0] = j0++;
4717 if (3 * i + nelt1 < nelt)
4718 sel[3 * i + nelt1] = nelt + j1++;
4719 if (3 * i + nelt2 < nelt)
4720 sel[3 * i + nelt2] = 0;
4722 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4724 for (i = 0; i < nelt; i++)
4726 if (3 * i + nelt0 < nelt)
4727 sel[3 * i + nelt0] = 3 * i + nelt0;
4728 if (3 * i + nelt1 < nelt)
4729 sel[3 * i + nelt1] = 3 * i + nelt1;
4730 if (3 * i + nelt2 < nelt)
4731 sel[3 * i + nelt2] = nelt + j2++;
4733 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4735 vect1 = dr_chain[0];
4736 vect2 = dr_chain[1];
4738 /* Create interleaving stmt:
4739 low = VEC_PERM_EXPR <vect1, vect2,
4740 {j, nelt, *, j + 1, nelt + j + 1, *,
4741 j + 2, nelt + j + 2, *, ...}> */
4742 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4743 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4744 vect2, perm3_mask_low);
4745 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4747 vect1 = data_ref;
4748 vect2 = dr_chain[2];
4749 /* Create interleaving stmt:
4750 low = VEC_PERM_EXPR <vect1, vect2,
4751 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4752 6, 7, nelt + j + 2, ...}> */
4753 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4754 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4755 vect2, perm3_mask_high);
4756 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4757 (*result_chain)[j] = data_ref;
4760 else
4762 /* If length is not equal to 3 then only a power of 2 is supported. */
4763 gcc_assert (exact_log2 (length) != -1);
4765 for (i = 0, n = nelt / 2; i < n; i++)
4767 sel[i * 2] = i;
4768 sel[i * 2 + 1] = i + nelt;
4770 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4772 for (i = 0; i < nelt; i++)
4773 sel[i] += nelt / 2;
4774 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4776 for (i = 0, n = log_length; i < n; i++)
4778 for (j = 0; j < length/2; j++)
4780 vect1 = dr_chain[j];
4781 vect2 = dr_chain[j+length/2];
4783 /* Create interleaving stmt:
4784 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4785 ...}> */
4786 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4787 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4788 vect2, perm_mask_high);
4789 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4790 (*result_chain)[2*j] = high;
4792 /* Create interleaving stmt:
4793 low = VEC_PERM_EXPR <vect1, vect2,
4794 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4795 ...}> */
4796 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4797 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4798 vect2, perm_mask_low);
4799 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4800 (*result_chain)[2*j+1] = low;
4802 memcpy (dr_chain.address (), result_chain->address (),
4803 length * sizeof (tree));
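/* Illustrative sketch, not part of this file: the interleave_high/low
   selection vectors built above for the power-of-2 case, applied to two
   integer arrays.  The identifiers are made up for the example.  With
   V1 = {0, ..., 7} and V2 = {8, ..., 15} it prints "0 8 1 9 2 10 3 11"
   and "4 12 5 13 6 14 7 15", matching the worked example in the comment.  */
#if 0
#include <stdio.h>

enum { NELT = 8 };

static void
interleave_sketch (const int *v1, const int *v2)
{
  int sel_high[NELT], sel_low[NELT], i;

  /* Same construction as the sel[] loops in the power-of-2 branch above.  */
  for (i = 0; i < NELT / 2; i++)
    {
      sel_high[i * 2] = i;
      sel_high[i * 2 + 1] = i + NELT;
    }
  for (i = 0; i < NELT; i++)
    sel_low[i] = sel_high[i] + NELT / 2;

  /* Emulate VEC_PERM_EXPR <v1, v2, sel>: an index below NELT selects from
     V1, otherwise from V2.  */
  for (i = 0; i < NELT; i++)
    printf ("%d ", sel_high[i] < NELT ? v1[sel_high[i]] : v2[sel_high[i] - NELT]);
  printf ("\n");
  for (i = 0; i < NELT; i++)
    printf ("%d ", sel_low[i] < NELT ? v1[sel_low[i]] : v2[sel_low[i] - NELT]);
  printf ("\n");
}
#endif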
4808 /* Function vect_setup_realignment
4810 This function is called when vectorizing an unaligned load using
4811 the dr_explicit_realign[_optimized] scheme.
4812 This function generates the following code at the loop prolog:
4814 p = initial_addr;
4815 x msq_init = *(floor(p)); # prolog load
4816 realignment_token = call target_builtin;
4817 loop:
4818 x msq = phi (msq_init, ---)
4820 The stmts marked with x are generated only for the case of
4821 dr_explicit_realign_optimized.
4823 The code above sets up a new (vector) pointer, pointing to the first
4824 location accessed by STMT, and a "floor-aligned" load using that pointer.
4825 It also generates code to compute the "realignment-token" (if the relevant
4826 target hook was defined), and creates a phi-node at the loop-header bb
4827 whose arguments are the result of the prolog-load (created by this
4828 function) and the result of a load that takes place in the loop (to be
4829 created by the caller to this function).
4831 For the case of dr_explicit_realign_optimized:
4832 The caller to this function uses the phi-result (msq) to create the
4833 realignment code inside the loop, and sets up the missing phi argument,
4834 as follows:
4835 loop:
4836 msq = phi (msq_init, lsq)
4837 lsq = *(floor(p')); # load in loop
4838 result = realign_load (msq, lsq, realignment_token);
4840 For the case of dr_explicit_realign:
4841 loop:
4842 msq = *(floor(p)); # load in loop
4843 p' = p + (VS-1);
4844 lsq = *(floor(p')); # load in loop
4845 result = realign_load (msq, lsq, realignment_token);
4847 Input:
4848 STMT - (scalar) load stmt to be vectorized. This load accesses
4849 a memory location that may be unaligned.
4850 BSI - place where new code is to be inserted.
4851 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4852 is used.
4854 Output:
4855 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4856 target hook, if defined.
4857 Return value - the result of the loop-header phi node. */
4859 tree
4860 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4861 tree *realignment_token,
4862 enum dr_alignment_support alignment_support_scheme,
4863 tree init_addr,
4864 struct loop **at_loop)
4866 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4867 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4868 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4869 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4870 struct loop *loop = NULL;
4871 edge pe = NULL;
4872 tree scalar_dest = gimple_assign_lhs (stmt);
4873 tree vec_dest;
4874 gimple *inc;
4875 tree ptr;
4876 tree data_ref;
4877 basic_block new_bb;
4878 tree msq_init = NULL_TREE;
4879 tree new_temp;
4880 gphi *phi_stmt;
4881 tree msq = NULL_TREE;
4882 gimple_seq stmts = NULL;
4883 bool inv_p;
4884 bool compute_in_loop = false;
4885 bool nested_in_vect_loop = false;
4886 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4887 struct loop *loop_for_initial_load = NULL;
4889 if (loop_vinfo)
4891 loop = LOOP_VINFO_LOOP (loop_vinfo);
4892 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4895 gcc_assert (alignment_support_scheme == dr_explicit_realign
4896 || alignment_support_scheme == dr_explicit_realign_optimized);
4898 /* We need to generate three things:
4899 1. the misalignment computation
4900 2. the extra vector load (for the optimized realignment scheme).
4901 3. the phi node for the two vectors from which the realignment is
4902 done (for the optimized realignment scheme). */
4904 /* 1. Determine where to generate the misalignment computation.
4906 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4907 calculation will be generated by this function, outside the loop (in the
4908 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4909 caller, inside the loop.
4911 Background: If the misalignment remains fixed throughout the iterations of
4912 the loop, then both realignment schemes are applicable, and also the
4913 misalignment computation can be done outside LOOP. This is because we are
4914 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4915 are a multiple of VS (the Vector Size), and therefore the misalignment in
4916 different vectorized LOOP iterations is always the same.
4917 The problem arises only if the memory access is in an inner-loop nested
4918 inside LOOP, which is now being vectorized using outer-loop vectorization.
4919 This is the only case when the misalignment of the memory access may not
4920 remain fixed throughout the iterations of the inner-loop (as explained in
4921 detail in vect_supportable_dr_alignment). In this case, not only is the
4922 optimized realignment scheme not applicable, but also the misalignment
4923 computation (and generation of the realignment token that is passed to
4924 REALIGN_LOAD) have to be done inside the loop.
4926 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4927 or not, which in turn determines if the misalignment is computed inside
4928 the inner-loop, or outside LOOP. */
4930 if (init_addr != NULL_TREE || !loop_vinfo)
4932 compute_in_loop = true;
4933 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4937 /* 2. Determine where to generate the extra vector load.
4939 For the optimized realignment scheme, instead of generating two vector
4940 loads in each iteration, we generate a single extra vector load in the
4941 preheader of the loop, and in each iteration reuse the result of the
4942 vector load from the previous iteration. In case the memory access is in
4943 an inner-loop nested inside LOOP, which is now being vectorized using
4944 outer-loop vectorization, we need to determine whether this initial vector
4945 load should be generated at the preheader of the inner-loop, or can be
4946 generated at the preheader of LOOP. If the memory access has no evolution
4947 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4948 to be generated inside LOOP (in the preheader of the inner-loop). */
4950 if (nested_in_vect_loop)
4952 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4953 bool invariant_in_outerloop =
4954 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4955 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4957 else
4958 loop_for_initial_load = loop;
4959 if (at_loop)
4960 *at_loop = loop_for_initial_load;
4962 if (loop_for_initial_load)
4963 pe = loop_preheader_edge (loop_for_initial_load);
4965 /* 3. For the case of the optimized realignment, create the first vector
4966 load at the loop preheader. */
4968 if (alignment_support_scheme == dr_explicit_realign_optimized)
4970 /* Create msq_init = *(floor(p1)) in the loop preheader */
4971 gassign *new_stmt;
4973 gcc_assert (!compute_in_loop);
4974 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4975 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4976 NULL_TREE, &init_addr, NULL, &inc,
4977 true, &inv_p);
4978 if (TREE_CODE (ptr) == SSA_NAME)
4979 new_temp = copy_ssa_name (ptr);
4980 else
4981 new_temp = make_ssa_name (TREE_TYPE (ptr));
4982 new_stmt = gimple_build_assign
4983 (new_temp, BIT_AND_EXPR, ptr,
4984 build_int_cst (TREE_TYPE (ptr),
4985 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4986 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4987 gcc_assert (!new_bb);
4988 data_ref
4989 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4990 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4991 new_stmt = gimple_build_assign (vec_dest, data_ref);
4992 new_temp = make_ssa_name (vec_dest, new_stmt);
4993 gimple_assign_set_lhs (new_stmt, new_temp);
4994 if (pe)
4996 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4997 gcc_assert (!new_bb);
4999 else
5000 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5002 msq_init = gimple_assign_lhs (new_stmt);
5005 /* 4. Create realignment token using a target builtin, if available.
5006 It is done either inside the containing loop, or before LOOP (as
5007 determined above). */
5009 if (targetm.vectorize.builtin_mask_for_load)
5011 gcall *new_stmt;
5012 tree builtin_decl;
5014 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5015 if (!init_addr)
5017 /* Generate the INIT_ADDR computation outside LOOP. */
5018 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5019 NULL_TREE, loop);
5020 if (loop)
5022 pe = loop_preheader_edge (loop);
5023 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5024 gcc_assert (!new_bb);
5026 else
5027 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5030 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5031 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5032 vec_dest =
5033 vect_create_destination_var (scalar_dest,
5034 gimple_call_return_type (new_stmt));
5035 new_temp = make_ssa_name (vec_dest, new_stmt);
5036 gimple_call_set_lhs (new_stmt, new_temp);
5038 if (compute_in_loop)
5039 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5040 else
5042 /* Generate the misalignment computation outside LOOP. */
5043 pe = loop_preheader_edge (loop);
5044 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5045 gcc_assert (!new_bb);
5048 *realignment_token = gimple_call_lhs (new_stmt);
5050 /* The result of the CALL_EXPR to this builtin is determined from
5051 the value of the parameter and no global variables are touched
5052 which makes the builtin a "const" function. Requiring the
5053 builtin to have the "const" attribute makes it unnecessary
5054 to call mark_call_clobbered. */
5055 gcc_assert (TREE_READONLY (builtin_decl));
5058 if (alignment_support_scheme == dr_explicit_realign)
5059 return msq;
5061 gcc_assert (!compute_in_loop);
5062 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5065 /* 5. Create msq = phi <msq_init, lsq> in loop */
5067 pe = loop_preheader_edge (containing_loop);
5068 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5069 msq = make_ssa_name (vec_dest);
5070 phi_stmt = create_phi_node (msq, containing_loop->header);
5071 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5073 return msq;
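/* Illustrative sketch, not part of this file: the "floor (p)" computation
   used for the prolog load above, i.e. the BIT_AND_EXPR with
   -TYPE_ALIGN_UNIT (vectype), written with plain C integer arithmetic.
   The function name is made up; ALIGN stands in for the vector alignment
   in bytes and must be a power of two.  */
#if 0
#include <stdint.h>

static const void *
floor_align_sketch (const void *p, uintptr_t align)
{
  /* Clear the low bits so the load starts at an aligned address.  */
  return (const void *) ((uintptr_t) p & -align);
}
#endif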
5077 /* Function vect_grouped_load_supported.
5079 Returns TRUE if even and odd permutations are supported,
5080 and FALSE otherwise. */
5082 bool
5083 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
5085 machine_mode mode = TYPE_MODE (vectype);
5087 /* vect_permute_load_chain requires the group size to be equal to 3 or
5088 be a power of two. */
5089 if (count != 3 && exact_log2 (count) == -1)
5091 if (dump_enabled_p ())
5092 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5093 "the size of the group of accesses"
5094 " is not a power of 2 or not equal to 3\n");
5095 return false;
5098 /* Check that the permutation is supported. */
5099 if (VECTOR_MODE_P (mode))
5101 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5102 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5104 if (count == 3)
5106 unsigned int k;
5107 for (k = 0; k < 3; k++)
5109 for (i = 0; i < nelt; i++)
5110 if (3 * i + k < 2 * nelt)
5111 sel[i] = 3 * i + k;
5112 else
5113 sel[i] = 0;
5114 if (!can_vec_perm_p (mode, false, sel))
5116 if (dump_enabled_p ())
5117 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5118 "shuffle of 3 loads is not supported by"
5119 " target\n");
5120 return false;
5122 for (i = 0, j = 0; i < nelt; i++)
5123 if (3 * i + k < 2 * nelt)
5124 sel[i] = i;
5125 else
5126 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5127 if (!can_vec_perm_p (mode, false, sel))
5129 if (dump_enabled_p ())
5130 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5131 "shuffle of 3 loads is not supported by"
5132 " target\n");
5133 return false;
5136 return true;
5138 else
5140 /* If length is not equal to 3 then only a power of 2 is supported. */
5141 gcc_assert (exact_log2 (count) != -1);
5142 for (i = 0; i < nelt; i++)
5143 sel[i] = i * 2;
5144 if (can_vec_perm_p (mode, false, sel))
5146 for (i = 0; i < nelt; i++)
5147 sel[i] = i * 2 + 1;
5148 if (can_vec_perm_p (mode, false, sel))
5149 return true;
5154 if (dump_enabled_p ())
5155 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5156 "extract even/odd not supported by target\n");
5157 return false;
5160 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5161 type VECTYPE. */
5163 bool
5164 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5166 return vect_lanes_optab_supported_p ("vec_load_lanes",
5167 vec_load_lanes_optab,
5168 vectype, count);
5171 /* Function vect_permute_load_chain.
5173 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5174 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5175 the input data correctly. Return the final references for loads in
5176 RESULT_CHAIN.
5178 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5179 The input is 4 vectors each containing 8 elements. We assign a number to each
5180 element, the input sequence is:
5182 1st vec: 0 1 2 3 4 5 6 7
5183 2nd vec: 8 9 10 11 12 13 14 15
5184 3rd vec: 16 17 18 19 20 21 22 23
5185 4th vec: 24 25 26 27 28 29 30 31
5187 The output sequence should be:
5189 1st vec: 0 4 8 12 16 20 24 28
5190 2nd vec: 1 5 9 13 17 21 25 29
5191 3rd vec: 2 6 10 14 18 22 26 30
5192 4th vec: 3 7 11 15 19 23 27 31
5194 i.e., the first output vector should contain the first elements of each
5195 interleaving group, etc.
5197 We use extract_even/odd instructions to create such output. The input of
5198 each extract_even/odd operation is two vectors
5199 1st vec 2nd vec
5200 0 1 2 3 4 5 6 7
5202 and the output is the vector of extracted even/odd elements. The output of
5203 extract_even will be: 0 2 4 6
5204 and of extract_odd: 1 3 5 7
5207 The permutation is done in log LENGTH stages. In each stage extract_even
5208 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5209 their order. In our example,
5211 E1: extract_even (1st vec, 2nd vec)
5212 E2: extract_odd (1st vec, 2nd vec)
5213 E3: extract_even (3rd vec, 4th vec)
5214 E4: extract_odd (3rd vec, 4th vec)
5216 The output for the first stage will be:
5218 E1: 0 2 4 6 8 10 12 14
5219 E2: 1 3 5 7 9 11 13 15
5220 E3: 16 18 20 22 24 26 28 30
5221 E4: 17 19 21 23 25 27 29 31
5223 In order to proceed and create the correct sequence for the next stage (or
5224 for the correct output, if the second stage is the last one, as in our
5225 example), we first put the output of extract_even operation and then the
5226 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5227 The input for the second stage is:
5229 1st vec (E1): 0 2 4 6 8 10 12 14
5230 2nd vec (E3): 16 18 20 22 24 26 28 30
5231 3rd vec (E2): 1 3 5 7 9 11 13 15
5232 4th vec (E4): 17 19 21 23 25 27 29 31
5234 The output of the second stage:
5236 E1: 0 4 8 12 16 20 24 28
5237 E2: 2 6 10 14 18 22 26 30
5238 E3: 1 5 9 13 17 21 25 29
5239 E4: 3 7 11 15 19 23 27 31
5241 And RESULT_CHAIN after reordering:
5243 1st vec (E1): 0 4 8 12 16 20 24 28
5244 2nd vec (E3): 1 5 9 13 17 21 25 29
5245 3rd vec (E2): 2 6 10 14 18 22 26 30
5246 4th vec (E4): 3 7 11 15 19 23 27 31. */
5248 static void
5249 vect_permute_load_chain (vec<tree> dr_chain,
5250 unsigned int length,
5251 gimple *stmt,
5252 gimple_stmt_iterator *gsi,
5253 vec<tree> *result_chain)
5255 tree data_ref, first_vect, second_vect;
5256 tree perm_mask_even, perm_mask_odd;
5257 tree perm3_mask_low, perm3_mask_high;
5258 gimple *perm_stmt;
5259 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5260 unsigned int i, j, log_length = exact_log2 (length);
5261 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5262 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5264 result_chain->quick_grow (length);
5265 memcpy (result_chain->address (), dr_chain.address (),
5266 length * sizeof (tree));
5268 if (length == 3)
5270 unsigned int k;
5272 for (k = 0; k < 3; k++)
5274 for (i = 0; i < nelt; i++)
5275 if (3 * i + k < 2 * nelt)
5276 sel[i] = 3 * i + k;
5277 else
5278 sel[i] = 0;
5279 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5281 for (i = 0, j = 0; i < nelt; i++)
5282 if (3 * i + k < 2 * nelt)
5283 sel[i] = i;
5284 else
5285 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5287 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5289 first_vect = dr_chain[0];
5290 second_vect = dr_chain[1];
5292 /* Create interleaving stmt (low part of):
5293 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5294 ...}> */
5295 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5296 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5297 second_vect, perm3_mask_low);
5298 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5300 /* Create interleaving stmt (high part of):
5301 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5302 ...}> */
5303 first_vect = data_ref;
5304 second_vect = dr_chain[2];
5305 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5306 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5307 second_vect, perm3_mask_high);
5308 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5309 (*result_chain)[k] = data_ref;
5312 else
5314 /* If length is not equal to 3 then only a power of 2 is supported. */
5315 gcc_assert (exact_log2 (length) != -1);
5317 for (i = 0; i < nelt; ++i)
5318 sel[i] = i * 2;
5319 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5321 for (i = 0; i < nelt; ++i)
5322 sel[i] = i * 2 + 1;
5323 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5325 for (i = 0; i < log_length; i++)
5327 for (j = 0; j < length; j += 2)
5329 first_vect = dr_chain[j];
5330 second_vect = dr_chain[j+1];
5332 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5333 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5334 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5335 first_vect, second_vect,
5336 perm_mask_even);
5337 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5338 (*result_chain)[j/2] = data_ref;
5340 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5341 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5342 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5343 first_vect, second_vect,
5344 perm_mask_odd);
5345 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5346 (*result_chain)[j/2+length/2] = data_ref;
5348 memcpy (dr_chain.address (), result_chain->address (),
5349 length * sizeof (tree));
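/* Illustrative sketch, not part of this file: the extract_even/odd
   selection vectors built above, applied to two integer arrays.  The
   identifiers are made up for the example.  With V1 = {0, ..., 7} and
   V2 = {8, ..., 15} it prints "0 2 4 6 8 10 12 14" and
   "1 3 5 7 9 11 13 15", i.e. the first stage of the worked example in
   the comment.  */
#if 0
#include <stdio.h>

static void
extract_even_odd_sketch (const int *v1, const int *v2, int nelt)
{
  int i, idx;

  for (i = 0; i < nelt; i++)		/* perm_mask_even: sel[i] = i * 2.  */
    {
      idx = i * 2;
      printf ("%d ", idx < nelt ? v1[idx] : v2[idx - nelt]);
    }
  printf ("\n");
  for (i = 0; i < nelt; i++)		/* perm_mask_odd: sel[i] = i * 2 + 1.  */
    {
      idx = i * 2 + 1;
      printf ("%d ", idx < nelt ? v1[idx] : v2[idx - nelt]);
    }
  printf ("\n");
}
#endif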
5354 /* Function vect_shift_permute_load_chain.
5356 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
5357 sequence of stmts to reorder the input data accordingly.
5358 Return the final references for loads in RESULT_CHAIN.
5359 Return true if successful, false otherwise.
5361 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5362 The input is 3 vectors each containing 8 elements. We assign a
5363 number to each element, the input sequence is:
5365 1st vec: 0 1 2 3 4 5 6 7
5366 2nd vec: 8 9 10 11 12 13 14 15
5367 3rd vec: 16 17 18 19 20 21 22 23
5369 The output sequence should be:
5371 1st vec: 0 3 6 9 12 15 18 21
5372 2nd vec: 1 4 7 10 13 16 19 22
5373 3rd vec: 2 5 8 11 14 17 20 23
5375 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5377 First we shuffle all 3 vectors to get correct elements order:
5379 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5380 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5381 3rd vec: (16 19 22) (17 20 23) (18 21)
5383 Next we unite and shift vector 3 times:
5385 1st step:
5386 shift right by 6 the concatenation of:
5387 "1st vec" and "2nd vec"
5388 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5389 "2nd vec" and "3rd vec"
5390 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5391 "3rd vec" and "1st vec"
5392 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5393 | New vectors |
5395 So that now new vectors are:
5397 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5398 2nd vec: (10 13) (16 19 22) (17 20 23)
5399 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5401 2nd step:
5402 shift right by 5 the concatenation of:
5403 "1st vec" and "3rd vec"
5404 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5405 "2nd vec" and "1st vec"
5406 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5407 "3rd vec" and "2nd vec"
5408 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5409 | New vectors |
5411 So that now new vectors are:
5413 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5414 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5415 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5417 3rd step:
5418 shift right by 5 the concatenation of:
5419 "1st vec" and "1st vec"
5420 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5421 shift right by 3 the concatenation of:
5422 "2nd vec" and "2nd vec"
5423 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5424 | New vectors |
5426 So that now all vectors are READY:
5427 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5428 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5429 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5431 This algorithm is faster than the one in vect_permute_load_chain if:
5432 1. "shift of a concatenation" is faster than general permutation.
5433 This is usually so.
5434 2. The TARGET machine can't execute vector instructions in parallel.
5435 This is because each step of the algorithm depends on the previous one.
5436 The algorithm in vect_permute_load_chain is much more parallel.
5438 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
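/* Scalar reference for the LENGTH == 3 reordering described above (an
   illustrative sketch only, not part of GCC; the function name is
   hypothetical).  IN and OUT each hold 3 * NELT elements: the three input
   vectors are viewed as one flat interleaved array, and output vector V
   gathers the elements whose flat index is congruent to V modulo 3.

   static void
   reorder_load_chain_3 (const int *in, int *out, unsigned int nelt)
   {
     for (unsigned int v = 0; v < 3; v++)
       for (unsigned int j = 0; j < nelt; j++)
         out[v * nelt + j] = in[3 * j + v];
   }  */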
5441 static bool
5442 vect_shift_permute_load_chain (vec<tree> dr_chain,
5443 unsigned int length,
5444 gimple *stmt,
5445 gimple_stmt_iterator *gsi,
5446 vec<tree> *result_chain)
5448 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5449 tree perm2_mask1, perm2_mask2, perm3_mask;
5450 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5451 gimple *perm_stmt;
5453 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5454 unsigned int i;
5455 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5456 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5457 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5458 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5460 result_chain->quick_grow (length);
5461 memcpy (result_chain->address (), dr_chain.address (),
5462 length * sizeof (tree));
5464 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5466 unsigned int j, log_length = exact_log2 (length);
5467 for (i = 0; i < nelt / 2; ++i)
5468 sel[i] = i * 2;
5469 for (i = 0; i < nelt / 2; ++i)
5470 sel[nelt / 2 + i] = i * 2 + 1;
5471 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5473 if (dump_enabled_p ())
5474 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5475 "shuffle of 2 fields structure is not "
5476 "supported by target\n");
5477 return false;
5479 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5481 for (i = 0; i < nelt / 2; ++i)
5482 sel[i] = i * 2 + 1;
5483 for (i = 0; i < nelt / 2; ++i)
5484 sel[nelt / 2 + i] = i * 2;
5485 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5487 if (dump_enabled_p ())
5488 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5489 "shuffle of 2 fields structure is not "
5490 "supported by target\n");
5491 return false;
5493 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5495 /* Generating permutation constant to shift all elements.
5496 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5497 for (i = 0; i < nelt; i++)
5498 sel[i] = nelt / 2 + i;
5499 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5501 if (dump_enabled_p ())
5502 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5503 "shift permutation is not supported by target\n");
5504 return false;
5506 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5508 /* Generating permutation constant to select a vector from the two.
5509 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5510 for (i = 0; i < nelt / 2; i++)
5511 sel[i] = i;
5512 for (i = nelt / 2; i < nelt; i++)
5513 sel[i] = nelt + i;
5514 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5516 if (dump_enabled_p ())
5517 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5518 "select is not supported by target\n");
5519 return false;
5521 select_mask = vect_gen_perm_mask_checked (vectype, sel);
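/* Illustrative check of the masks above for NELT == 8 (a sketch only, not
   part of GCC; scalar_vec_perm is a hypothetical scalar model of
   VEC_PERM_EXPR that indexes into the concatenation of A and B).

   static void
   scalar_vec_perm (const int *a, const int *b, const unsigned char *sel,
                    int *out, unsigned int nelt)
   {
     for (unsigned int i = 0; i < nelt; i++)
       out[i] = sel[i] < nelt ? a[sel[i]] : b[sel[i] - nelt];
   }

   With first = {a0 ... a7} and second = {b0 ... b7}:
     perm2_mask1 = {0 2 4 6 1 3 5 7}:       vect[0] = {a0 a2 a4 a6 a1 a3 a5 a7}
     perm2_mask2 = {1 3 5 7 0 2 4 6}:       vect[1] = {b1 b3 b5 b7 b0 b2 b4 b6}
     shift1_mask = {4 ... 11}:              {a1 a3 a5 a7 b1 b3 b5 b7}  (odd elements)
     select_mask = {0 1 2 3 12 13 14 15}:   {a0 a2 a4 a6 b0 b2 b4 b6}  (even elements).  */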
5523 for (i = 0; i < log_length; i++)
5525 for (j = 0; j < length; j += 2)
5527 first_vect = dr_chain[j];
5528 second_vect = dr_chain[j + 1];
5530 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5531 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5532 first_vect, first_vect,
5533 perm2_mask1);
5534 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5535 vect[0] = data_ref;
5537 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5538 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5539 second_vect, second_vect,
5540 perm2_mask2);
5541 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5542 vect[1] = data_ref;
5544 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5545 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5546 vect[0], vect[1], shift1_mask);
5547 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5548 (*result_chain)[j/2 + length/2] = data_ref;
5550 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5551 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5552 vect[0], vect[1], select_mask);
5553 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5554 (*result_chain)[j/2] = data_ref;
5556 memcpy (dr_chain.address (), result_chain->address (),
5557 length * sizeof (tree));
5559 return true;
5561 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5563 unsigned int k = 0, l = 0;
5565 /* Generating permutation constant to get all elements in right order.
5566 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5567 for (i = 0; i < nelt; i++)
5569 if (3 * k + (l % 3) >= nelt)
5571 k = 0;
5572 l += (3 - (nelt % 3));
5574 sel[i] = 3 * k + (l % 3);
5575 k++;
5577 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5579 if (dump_enabled_p ())
5580 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5581 "shuffle of 3 fields structure is not "
5582 "supported by target\n");
5583 return false;
5585 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
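/* Worked trace of the mask construction above for NELT == 8 (illustrative
   only):
     i = 0..2:  k = 0, 1, 2 and l = 0, so sel = 0, 3, 6
     i = 3:     3 * 3 >= 8, so k is reset and l += 3 - (8 % 3) = 1; sel = 1
     i = 4..5:  sel = 4, 7
     i = 6:     3 * 3 + 1 >= 8, so k is reset and l += 1; sel = 2
     i = 7:     sel = 5
   which yields the mask {0 3 6 1 4 7 2 5} quoted in the comment.  */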
5587 /* Generating permutation constant to shift all elements.
5588 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5589 for (i = 0; i < nelt; i++)
5590 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5591 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5593 if (dump_enabled_p ())
5594 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5595 "shift permutation is not supported by target\n");
5596 return false;
5598 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5600 /* Generating permutation constant to shift all elements.
5601 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5602 for (i = 0; i < nelt; i++)
5603 sel[i] = 2 * (nelt / 3) + 1 + i;
5604 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5606 if (dump_enabled_p ())
5607 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5608 "shift permutation is not supported by target\n");
5609 return false;
5611 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5613 /* Generating permutation constant to shift all elements.
5614 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5615 for (i = 0; i < nelt; i++)
5616 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5617 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5619 if (dump_enabled_p ())
5620 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5621 "shift permutation is not supported by target\n");
5622 return false;
5624 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5626 /* Generating permutation constant to shift all elements.
5627 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5628 for (i = 0; i < nelt; i++)
5629 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5630 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5632 if (dump_enabled_p ())
5633 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5634 "shift permutation is not supported by target\n");
5635 return false;
5637 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5639 for (k = 0; k < 3; k++)
5641 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5642 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5643 dr_chain[k], dr_chain[k],
5644 perm3_mask);
5645 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5646 vect[k] = data_ref;
5649 for (k = 0; k < 3; k++)
5651 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5652 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5653 vect[k % 3], vect[(k + 1) % 3],
5654 shift1_mask);
5655 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5656 vect_shift[k] = data_ref;
5659 for (k = 0; k < 3; k++)
5661 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5662 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5663 vect_shift[(4 - k) % 3],
5664 vect_shift[(3 - k) % 3],
5665 shift2_mask);
5666 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5667 vect[k] = data_ref;
5670 (*result_chain)[3 - (nelt % 3)] = vect[2];
5672 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5673 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5674 vect[0], shift3_mask);
5675 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5676 (*result_chain)[nelt % 3] = data_ref;
5678 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5679 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5680 vect[1], shift4_mask);
5681 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5682 (*result_chain)[0] = data_ref;
5683 return true;
5685 return false;
5688 /* Function vect_transform_grouped_load.
5690 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5691 to perform their permutation and assign the resulting vectorized statements to
5692 the scalar statements.
5695 void
5696 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5697 gimple_stmt_iterator *gsi)
5699 machine_mode mode;
5700 vec<tree> result_chain = vNULL;
5702 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5703 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the permuted
5704 vectors, which are ready for vector computation. */
5705 result_chain.create (size);
5707 /* If the reassociation width for the vector type is 2 or greater, the target
5708 machine can execute 2 or more vector instructions in parallel. Otherwise try
5709 to get the chain for the load group using vect_shift_permute_load_chain. */
5710 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5711 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5712 || exact_log2 (size) != -1
5713 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5714 gsi, &result_chain))
5715 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5716 vect_record_grouped_load_vectors (stmt, result_chain);
5717 result_chain.release ();
5720 /* RESULT_CHAIN contains the output of the grouped loads that were
5721 generated as part of the vectorization of STMT. Assign the statement
5722 for each vector to the associated scalar statement. */
5724 void
5725 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5727 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5728 gimple *next_stmt, *new_stmt;
5729 unsigned int i, gap_count;
5730 tree tmp_data_ref;
5732 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5733 Since we scan the chain starting from its first node, their order
5734 corresponds to the order of data-refs in RESULT_CHAIN. */
5735 next_stmt = first_stmt;
5736 gap_count = 1;
5737 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5739 if (!next_stmt)
5740 break;
5742 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5743 code elimination pass later. No need to check for the first stmt in
5744 the group, since it always exists.
5745 GROUP_GAP is the number of steps in elements from the previous
5746 access (if there is no gap GROUP_GAP is 1). We skip loads that
5747 correspond to the gaps. */
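/* For example (illustrative): if the group accesses a[4*i] and a[4*i + 2],
   the second statement has GROUP_GAP 2, so the vector in RESULT_CHAIN that
   corresponds to the unused a[4*i + 1] lane is skipped here and its loads
   are later removed as dead code.  */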
5748 if (next_stmt != first_stmt
5749 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5751 gap_count++;
5752 continue;
5755 while (next_stmt)
5757 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5758 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5759 copies, and we put the new vector statement in the first available
5760 RELATED_STMT. */
5761 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5762 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5763 else
5765 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5767 gimple *prev_stmt =
5768 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5769 gimple *rel_stmt =
5770 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5771 while (rel_stmt)
5773 prev_stmt = rel_stmt;
5774 rel_stmt =
5775 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5778 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5779 new_stmt;
5783 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5784 gap_count = 1;
5785 /* If NEXT_STMT accesses the same DR as the previous statement,
5786 put the same TMP_DATA_REF as its vectorized statement; otherwise
5787 get the next data-ref from RESULT_CHAIN. */
5788 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5789 break;
5794 /* Function vect_can_force_dr_alignment_p.
5796 Returns whether the alignment of DECL can be forced to be aligned
5797 on an ALIGNMENT-bit boundary. */
5799 bool
5800 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5802 if (TREE_CODE (decl) != VAR_DECL)
5803 return false;
5805 if (decl_in_symtab_p (decl)
5806 && !symtab_node::get (decl)->can_increase_alignment_p ())
5807 return false;
5809 if (TREE_STATIC (decl))
5810 return (alignment <= MAX_OFILE_ALIGNMENT);
5811 else
5812 return (alignment <= MAX_STACK_ALIGNMENT);
5816 /* Return whether the data reference DR is supported with respect to its
5817 alignment.
5818 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even if
5819 it is aligned, i.e., check if it is possible to vectorize it with different
5820 alignment. */
5822 enum dr_alignment_support
5823 vect_supportable_dr_alignment (struct data_reference *dr,
5824 bool check_aligned_accesses)
5826 gimple *stmt = DR_STMT (dr);
5827 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5828 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5829 machine_mode mode = TYPE_MODE (vectype);
5830 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5831 struct loop *vect_loop = NULL;
5832 bool nested_in_vect_loop = false;
5834 if (aligned_access_p (dr) && !check_aligned_accesses)
5835 return dr_aligned;
5837 /* For now assume all conditional loads/stores support unaligned
5838 access without any special code. */
5839 if (is_gimple_call (stmt)
5840 && gimple_call_internal_p (stmt)
5841 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5842 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5843 return dr_unaligned_supported;
5845 if (loop_vinfo)
5847 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5848 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5851 /* Possibly unaligned access. */
5853 /* We can choose between using the implicit realignment scheme (generating
5854 a misaligned_move stmt) and the explicit realignment scheme (generating
5855 aligned loads with a REALIGN_LOAD). There are two variants of the
5856 explicit realignment scheme: optimized and unoptimized.
5857 We can optimize the realignment only if the step between consecutive
5858 vector loads is equal to the vector size. Since the vector memory
5859 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5860 is guaranteed that the misalignment amount remains the same throughout the
5861 execution of the vectorized loop. Therefore, we can create the
5862 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5863 at the loop preheader.
5865 However, in the case of outer-loop vectorization, when vectorizing a
5866 memory access in the inner-loop nested within the LOOP that is now being
5867 vectorized, while it is guaranteed that the misalignment of the
5868 vectorized memory access will remain the same in different outer-loop
5869 iterations, it is *not* guaranteed that it will remain the same throughout
5870 the execution of the inner-loop. This is because the inner-loop advances
5871 with the original scalar step (and not in steps of VS). If the inner-loop
5872 step happens to be a multiple of VS, then the misalignment remains fixed
5873 and we can use the optimized realignment scheme. For example:
5875 for (i=0; i<N; i++)
5876 for (j=0; j<M; j++)
5877 s += a[i+j];
5879 When vectorizing the i-loop in the above example, the step between
5880 consecutive vector loads is 1, and so the misalignment does not remain
5881 fixed across the execution of the inner-loop, and the realignment cannot
5882 be optimized (as illustrated in the following pseudo vectorized loop):
5884 for (i=0; i<N; i+=4)
5885 for (j=0; j<M; j++){
5886 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5887 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5888 // (assuming that we start from an aligned address).
5891 We therefore have to use the unoptimized realignment scheme:
5893 for (i=0; i<N; i+=4)
5894 for (j=k; j<M; j+=4)
5895 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5896 // that the misalignment of the initial address is
5897 // 0).
5899 The loop can then be vectorized as follows:
5901 for (k=0; k<4; k++){
5902 rt = get_realignment_token (&vp[k]);
5903 for (i=0; i<N; i+=4){
5904 v1 = vp[i+k];
5905 for (j=k; j<M; j+=4){
5906 v2 = vp[i+j+VS-1];
5907 va = REALIGN_LOAD <v1,v2,rt>;
5908 vs += va;
5909 v1 = v2;
5912 } */
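/* To summarize the checks below (descriptive only): reads may use the
   explicit realignment scheme when the target provides vec_realign_load
   (dr_explicit_realign_optimized when the misalignment stays fixed across
   the vectorized loop, dr_explicit_realign otherwise); otherwise both reads
   and writes return dr_unaligned_supported when the target supports
   misaligned vector accesses for this mode, and dr_unaligned_unsupported
   if not.  */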
5914 if (DR_IS_READ (dr))
5916 bool is_packed = false;
5917 tree type = (TREE_TYPE (DR_REF (dr)));
5919 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5920 && (!targetm.vectorize.builtin_mask_for_load
5921 || targetm.vectorize.builtin_mask_for_load ()))
5923 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5924 if ((nested_in_vect_loop
5925 && (TREE_INT_CST_LOW (DR_STEP (dr))
5926 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5927 || !loop_vinfo)
5928 return dr_explicit_realign;
5929 else
5930 return dr_explicit_realign_optimized;
5932 if (!known_alignment_for_access_p (dr))
5933 is_packed = not_size_aligned (DR_REF (dr));
5935 if ((TYPE_USER_ALIGN (type) && !is_packed)
5936 || targetm.vectorize.
5937 support_vector_misalignment (mode, type,
5938 DR_MISALIGNMENT (dr), is_packed))
5939 /* Can't software pipeline the loads, but can at least do them. */
5940 return dr_unaligned_supported;
5942 else
5944 bool is_packed = false;
5945 tree type = (TREE_TYPE (DR_REF (dr)));
5947 if (!known_alignment_for_access_p (dr))
5948 is_packed = not_size_aligned (DR_REF (dr));
5950 if ((TYPE_USER_ALIGN (type) && !is_packed)
5951 || targetm.vectorize.
5952 support_vector_misalignment (mode, type,
5953 DR_MISALIGNMENT (dr), is_packed))
5954 return dr_unaligned_supported;
5957 /* Unsupported. */
5958 return dr_unaligned_unsupported;