gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "backend.h"
27 #include "predict.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "rtl.h"
31 #include "ssa.h"
32 #include "alias.h"
33 #include "fold-const.h"
34 #include "stor-layout.h"
35 #include "tm_p.h"
36 #include "target.h"
37 #include "gimple-pretty-print.h"
38 #include "internal-fn.h"
39 #include "tree-eh.h"
40 #include "gimplify.h"
41 #include "gimple-iterator.h"
42 #include "gimplify-me.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "tree-ssa-loop-manip.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-chrec.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "diagnostic-core.h"
51 #include "cgraph.h"
52 #include "expr.h"
53 #include "insn-codes.h"
54 #include "optabs-tree.h"
55 #include "builtins.h"
56 #include "params.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 limit_p = !targetm.array_mode_supported_p (mode, count);
70 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
71 MODE_INT, limit_p);
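  /* For instance (purely illustrative numbers): for a 128-bit vector mode
     and COUNT == 3 the request above asks for a 384-bit integer mode; if
     the target provides no such mode, mode_for_size returns BLKmode and
     the lanes optab cannot be used.  */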
73 if (array_mode == BLKmode)
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
82 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
84 if (dump_enabled_p ())
85 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
86 "cannot use %s<%s><%s>\n", name,
87 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
88 return false;
91 if (dump_enabled_p ())
92 dump_printf_loc (MSG_NOTE, vect_location,
93 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
94 GET_MODE_NAME (mode));
96 return true;
100 /* Return the smallest scalar part of STMT.
101 This is used to determine the vectype of the stmt. We generally set the
102 vectype according to the type of the result (lhs). For stmts whose
103 result-type is different than the type of the arguments (e.g., demotion,
104 promotion), vectype will be reset appropriately (later). Note that we have
105 to visit the smallest datatype in this function, because that determines the
106    VF.  If the smallest datatype in the loop is present only as the rhs of a
107    promotion operation, we would otherwise miss it.
108 Such a case, where a variable of this datatype does not appear in the lhs
109 anywhere in the loop, can only occur if it's an invariant: e.g.:
110 'int_x = (int) short_inv', which we'd expect to have been optimized away by
111 invariant motion. However, we cannot rely on invariant motion to always
112 take invariants out of the loop, and so in the case of promotion we also
113 have to check the rhs.
114 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
115 types. */
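   /* For example (sizes assume a typical target where 'int' is 4 bytes and
      'short' is 2): given

        int_x = (int) short_y;

      the lhs size is 4 and the rhs size is 2, so 'short int' is returned
      and the vectorization factor is derived from the 2-byte element.  */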
117 tree
118 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
119 HOST_WIDE_INT *rhs_size_unit)
121 tree scalar_type = gimple_expr_type (stmt);
122 HOST_WIDE_INT lhs, rhs;
124 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
126 if (is_gimple_assign (stmt)
127 && (gimple_assign_cast_p (stmt)
128 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
129 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
130 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
132 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
134 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
135 if (rhs < lhs)
136 scalar_type = rhs_type;
139 *lhs_size_unit = lhs;
140 *rhs_size_unit = rhs;
141 return scalar_type;
145 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
146 tested at run-time. Return TRUE if DDR was successfully inserted.
147 Return false if versioning is not supported. */
149 static bool
150 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
152 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
154 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
155 return false;
157 if (dump_enabled_p ())
159 dump_printf_loc (MSG_NOTE, vect_location,
160 "mark for run-time aliasing test between ");
161 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
162 dump_printf (MSG_NOTE, " and ");
163 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
164 dump_printf (MSG_NOTE, "\n");
167 if (optimize_loop_nest_for_size_p (loop))
169 if (dump_enabled_p ())
170 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
171 "versioning not supported when optimizing"
172 " for size.\n");
173 return false;
176 /* FORNOW: We don't support versioning with outer-loop vectorization. */
177 if (loop->inner)
179 if (dump_enabled_p ())
180 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
181 "versioning not yet supported for outer-loops.\n");
182 return false;
185 /* FORNOW: We don't support creating runtime alias tests for non-constant
186 step. */
187 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
188 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
190 if (dump_enabled_p ())
191 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
192 "versioning not yet supported for non-constant "
193 "step\n");
194 return false;
197 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
198 return true;
202 /* Function vect_analyze_data_ref_dependence.
204    Return TRUE if there (might) exist a dependence between a memory-reference
205    DRA and a memory-reference DRB.  When the dependence can instead be checked
206    at run-time by versioning for alias, return FALSE.  Adjust *MAX_VF
207    according to the data dependence.  */
209 static bool
210 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
211 loop_vec_info loop_vinfo, int *max_vf)
213 unsigned int i;
214 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
215 struct data_reference *dra = DDR_A (ddr);
216 struct data_reference *drb = DDR_B (ddr);
217 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
218 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
219 lambda_vector dist_v;
220 unsigned int loop_depth;
222 /* In loop analysis all data references should be vectorizable. */
223 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
224 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
225 gcc_unreachable ();
227 /* Independent data accesses. */
228 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
229 return false;
231 if (dra == drb
232 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
233 return false;
235 /* Even if we have an anti-dependence then, as the vectorized loop covers at
236 least two scalar iterations, there is always also a true dependence.
237 As the vectorizer does not re-order loads and stores we can ignore
238 the anti-dependence if TBAA can disambiguate both DRs similar to the
239 case with known negative distance anti-dependences (positive
240 distance anti-dependences would violate TBAA constraints). */
241 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
242 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
243 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
244 get_alias_set (DR_REF (drb))))
245 return false;
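  /* For instance, a read through an 'int *' and a write through a
     'float *' have non-conflicting alias sets, so such an apparent
     dependence can safely be discarded here.  */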
247 /* Unknown data dependence. */
248 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
250 /* If user asserted safelen consecutive iterations can be
251 executed concurrently, assume independence. */
252 if (loop->safelen >= 2)
254 if (loop->safelen < *max_vf)
255 *max_vf = loop->safelen;
256 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
257 return false;
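      /* For example, a loop annotated with '#pragma omp simd safelen(8)'
	 promises that it is safe to execute up to 8 iterations concurrently,
	 so the unknown dependence can be ignored as long as the
	 vectorization factor does not exceed 8.  */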
260 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
261 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
263 if (dump_enabled_p ())
265 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
266 "versioning for alias not supported for: "
267 "can't determine dependence between ");
268 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
269 DR_REF (dra));
270 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
271 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
272 DR_REF (drb));
273 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
275 return true;
278 if (dump_enabled_p ())
280 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
281 "versioning for alias required: "
282 "can't determine dependence between ");
283 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
284 DR_REF (dra));
285 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
286 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
287 DR_REF (drb));
288 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
291 /* Add to list of ddrs that need to be tested at run-time. */
292 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
295 /* Known data dependence. */
296 if (DDR_NUM_DIST_VECTS (ddr) == 0)
298 /* If user asserted safelen consecutive iterations can be
299 executed concurrently, assume independence. */
300 if (loop->safelen >= 2)
302 if (loop->safelen < *max_vf)
303 *max_vf = loop->safelen;
304 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
305 return false;
308 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
309 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
311 if (dump_enabled_p ())
313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
314 "versioning for alias not supported for: "
315 "bad dist vector for ");
316 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
317 DR_REF (dra));
318 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
319 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
320 DR_REF (drb));
321 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
323 return true;
326 if (dump_enabled_p ())
328 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
329 "versioning for alias required: "
330 "bad dist vector for ");
331 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
332 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
333 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
334 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
336 /* Add to list of ddrs that need to be tested at run-time. */
337 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
340 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
341 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
343 int dist = dist_v[loop_depth];
345 if (dump_enabled_p ())
346 dump_printf_loc (MSG_NOTE, vect_location,
347 "dependence distance = %d.\n", dist);
349 if (dist == 0)
351 if (dump_enabled_p ())
353 dump_printf_loc (MSG_NOTE, vect_location,
354 "dependence distance == 0 between ");
355 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
356 dump_printf (MSG_NOTE, " and ");
357 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
358 dump_printf (MSG_NOTE, "\n");
361 /* When we perform grouped accesses and perform implicit CSE
362 by detecting equal accesses and doing disambiguation with
363 runtime alias tests like for
364 .. = a[i];
365 .. = a[i+1];
366 a[i] = ..;
367 a[i+1] = ..;
368 *p = ..;
369 .. = a[i];
370 .. = a[i+1];
371 where we will end up loading { a[i], a[i+1] } once, make
372 sure that inserting group loads before the first load and
373 stores after the last store will do the right thing.
374 Similar for groups like
375 a[i] = ...;
376 ... = a[i];
377 a[i+1] = ...;
378 where loads from the group interleave with the store. */
379 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
380 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
382 gimple *earlier_stmt;
383 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
384 if (DR_IS_WRITE
385 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
387 if (dump_enabled_p ())
388 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
389 "READ_WRITE dependence in interleaving."
390 "\n");
391 return true;
395 continue;
398 if (dist > 0 && DDR_REVERSED_P (ddr))
400 /* If DDR_REVERSED_P the order of the data-refs in DDR was
401 reversed (to make distance vector positive), and the actual
402 distance is negative. */
403 if (dump_enabled_p ())
404 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
405 "dependence distance negative.\n");
406 /* Record a negative dependence distance to later limit the
407 amount of stmt copying / unrolling we can perform.
408 Only need to handle read-after-write dependence. */
409 if (DR_IS_READ (drb)
410 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
411 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
412 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
413 continue;
416 if (abs (dist) >= 2
417 && abs (dist) < *max_vf)
419 /* The dependence distance requires reduction of the maximal
420 vectorization factor. */
421 *max_vf = abs (dist);
422 if (dump_enabled_p ())
423 dump_printf_loc (MSG_NOTE, vect_location,
424 "adjusting maximal vectorization factor to %i\n",
425 *max_vf);
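	  /* For instance, a dependence distance of 3 (two accesses to the
	     same memory location that are three scalar iterations apart)
	     still allows vectorization, but only with a vectorization
	     factor of at most 3; the adjustment above enforces that.  */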
428 if (abs (dist) >= *max_vf)
430 /* Dependence distance does not create dependence, as far as
431 vectorization is concerned, in this case. */
432 if (dump_enabled_p ())
433 dump_printf_loc (MSG_NOTE, vect_location,
434 "dependence distance >= VF.\n");
435 continue;
438 if (dump_enabled_p ())
440 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
441 "not vectorized, possible dependence "
442 "between data-refs ");
443 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
444 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
445 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
446 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
449 return true;
452 return false;
455 /* Function vect_analyze_data_ref_dependences.
457 Examine all the data references in the loop, and make sure there do not
458 exist any data dependences between them. Set *MAX_VF according to
459 the maximum vectorization factor the data dependences allow. */
461 bool
462 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
464 unsigned int i;
465 struct data_dependence_relation *ddr;
467 if (dump_enabled_p ())
468 dump_printf_loc (MSG_NOTE, vect_location,
469 "=== vect_analyze_data_ref_dependences ===\n");
471 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
472 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
473 &LOOP_VINFO_DDRS (loop_vinfo),
474 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
475 return false;
477 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
478 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
479 return false;
481 return true;
485 /* Function vect_slp_analyze_data_ref_dependence.
487    Return TRUE if there (might) exist a dependence between a memory-reference
488    DRA and a memory-reference DRB that prevents vectorizing the basic block.
489    Unlike the loop variant above, no run-time alias versioning and no *MAX_VF
490    adjustment are performed here.  */
492 static bool
493 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
495 struct data_reference *dra = DDR_A (ddr);
496 struct data_reference *drb = DDR_B (ddr);
498 /* We need to check dependences of statements marked as unvectorizable
499    as well; they can still prohibit vectorization.  */
501 /* Independent data accesses. */
502 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
503 return false;
505 if (dra == drb)
506 return false;
508 /* Read-read is OK. */
509 if (DR_IS_READ (dra) && DR_IS_READ (drb))
510 return false;
512 /* If dra and drb are part of the same interleaving chain consider
513 them independent. */
514 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
515 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
516 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
517 return false;
519 /* Unknown data dependence. */
520 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
522 if (dump_enabled_p ())
524 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
525 "can't determine dependence between ");
526 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
527 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
528 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
529 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
532 else if (dump_enabled_p ())
534 dump_printf_loc (MSG_NOTE, vect_location,
535 "determined dependence between ");
536 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
537 dump_printf (MSG_NOTE, " and ");
538 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
539 dump_printf (MSG_NOTE, "\n");
542 /* We do not vectorize basic blocks with write-write dependencies. */
543 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
544 return true;
546 /* If we have a read-write dependence, check that the load is before the store.
547    When we vectorize basic blocks, a vector load can only be placed before the
548    corresponding scalar load, and a vector store can only be placed after its
549    corresponding scalar store.  So the order of the accesses is preserved in
550 case the load is before the store. */
551 gimple *earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
552 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
554 /* That only holds for load-store pairs taking part in vectorization. */
555 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
556 && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
557 return false;
560 return true;
564 /* Function vect_slp_analyze_data_ref_dependences.
566    Examine all the data references in the basic-block, and make sure there
567    do not exist any data dependences between them.  Return FALSE if a
568    dependence that prohibits vectorization is found.  */
570 bool
571 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
573 struct data_dependence_relation *ddr;
574 unsigned int i;
576 if (dump_enabled_p ())
577 dump_printf_loc (MSG_NOTE, vect_location,
578 "=== vect_slp_analyze_data_ref_dependences ===\n");
580 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
581 &BB_VINFO_DDRS (bb_vinfo),
582 vNULL, true))
583 return false;
585 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
586 if (vect_slp_analyze_data_ref_dependence (ddr))
587 return false;
589 return true;
593 /* Function vect_compute_data_ref_alignment
595 Compute the misalignment of the data reference DR.
597 Output:
598 1. If during the misalignment computation it is found that the data reference
599 cannot be vectorized then false is returned.
600 2. DR_MISALIGNMENT (DR) is defined.
602 FOR NOW: No analysis is actually performed. Misalignment is calculated
603 only for trivial cases. TODO. */
605 static bool
606 vect_compute_data_ref_alignment (struct data_reference *dr)
608 gimple *stmt = DR_STMT (dr);
609 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
610 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
611 struct loop *loop = NULL;
612 tree ref = DR_REF (dr);
613 tree vectype;
614 tree base, base_addr;
615 tree misalign = NULL_TREE;
616 tree aligned_to;
617 unsigned HOST_WIDE_INT alignment;
619 if (dump_enabled_p ())
620 dump_printf_loc (MSG_NOTE, vect_location,
621 "vect_compute_data_ref_alignment:\n");
623 if (loop_vinfo)
624 loop = LOOP_VINFO_LOOP (loop_vinfo);
626 /* Initialize misalignment to unknown. */
627 SET_DR_MISALIGNMENT (dr, -1);
629 /* Strided accesses perform only component accesses, misalignment information
630 is irrelevant for them. */
631 if (STMT_VINFO_STRIDED_P (stmt_info)
632 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
633 return true;
635 if (tree_fits_shwi_p (DR_STEP (dr)))
636 misalign = DR_INIT (dr);
637 aligned_to = DR_ALIGNED_TO (dr);
638 base_addr = DR_BASE_ADDRESS (dr);
639 vectype = STMT_VINFO_VECTYPE (stmt_info);
641 /* In case the dataref is in an inner-loop of the loop that is being
642 vectorized (LOOP), we use the base and misalignment information
643    relative to the outer-loop (LOOP).  This is ok only if the misalignment
644    stays the same throughout the execution of the inner-loop, which is why
645    we have to check that the stride of the dataref in the inner-loop is a
646    multiple of the vector size.  */
647 if (loop && nested_in_vect_loop_p (loop, stmt))
649 tree step = DR_STEP (dr);
651 if (tree_fits_shwi_p (step)
652 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
654 if (dump_enabled_p ())
655 dump_printf_loc (MSG_NOTE, vect_location,
656 "inner step divides the vector-size.\n");
657 misalign = STMT_VINFO_DR_INIT (stmt_info);
658 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
659 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
661 else
663 if (dump_enabled_p ())
664 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
665 "inner step doesn't divide the vector-size.\n");
666 misalign = NULL_TREE;
670 /* Similarly we can only use base and misalignment information relative to
671    an innermost loop if the misalignment stays the same throughout the
672    execution of the loop.  As above, this is the case if the stride of the
673    dataref times the vectorization factor is a multiple of the vector size.  */
674 else
676 tree step = DR_STEP (dr);
677 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
679 if (tree_fits_shwi_p (step)
680 && ((tree_to_shwi (step) * vf)
681 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
683 if (dump_enabled_p ())
684 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
685 "step doesn't divide the vector-size.\n");
686 misalign = NULL_TREE;
690 /* To look at alignment of the base we have to preserve an inner MEM_REF
691 as that carries alignment information of the actual access. */
692 base = ref;
693 while (handled_component_p (base))
694 base = TREE_OPERAND (base, 0);
695 if (TREE_CODE (base) == MEM_REF)
696 base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
697 build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
698 unsigned int base_alignment = get_object_alignment (base);
700 if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
701 DR_VECT_AUX (dr)->base_element_aligned = true;
703 alignment = TYPE_ALIGN_UNIT (vectype);
705 if ((compare_tree_int (aligned_to, alignment) < 0)
706 || !misalign)
708 if (dump_enabled_p ())
710 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
711 "Unknown alignment for access: ");
712 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
713 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
715 return true;
718 if (base_alignment < TYPE_ALIGN (vectype))
720 /* Strip an inner MEM_REF to a bare decl if possible. */
721 if (TREE_CODE (base) == MEM_REF
722 && integer_zerop (TREE_OPERAND (base, 1))
723 && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
724 base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
726 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
728 if (dump_enabled_p ())
730 dump_printf_loc (MSG_NOTE, vect_location,
731 "can't force alignment of ref: ");
732 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
733 dump_printf (MSG_NOTE, "\n");
735 return true;
738 /* Force the alignment of the decl.
739 NOTE: This is the only change to the code we make during
740 the analysis phase, before deciding to vectorize the loop. */
741 if (dump_enabled_p ())
743 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
744 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
745 dump_printf (MSG_NOTE, "\n");
748 DR_VECT_AUX (dr)->base_decl = base;
749 DR_VECT_AUX (dr)->base_misaligned = true;
750 DR_VECT_AUX (dr)->base_element_aligned = true;
753 /* If this is a backward running DR then the first access in the larger
754    vectype actually is N-1 elements before the address given in the DR.
755    Adjust misalign accordingly.  */
756 if (tree_int_cst_sgn (DR_STEP (dr)) < 0)
758 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
759 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
760 otherwise we wouldn't be here. */
761 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
762 /* PLUS because DR_STEP was negative. */
763 misalign = size_binop (PLUS_EXPR, misalign, offset);
766 SET_DR_MISALIGNMENT (dr,
767 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
769 if (dump_enabled_p ())
771 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
772 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
773 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
774 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
777 return true;
781 /* Function vect_compute_data_refs_alignment
783 Compute the misalignment of data references in the loop.
784 Return FALSE if a data reference is found that cannot be vectorized. */
786 static bool
787 vect_compute_data_refs_alignment (vec_info *vinfo)
789 vec<data_reference_p> datarefs = vinfo->datarefs;
790 struct data_reference *dr;
791 unsigned int i;
793 FOR_EACH_VEC_ELT (datarefs, i, dr)
794 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
795 && !vect_compute_data_ref_alignment (dr))
797 if (is_a <bb_vec_info> (vinfo))
799 /* Mark unsupported statement as unvectorizable. */
800 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
801 continue;
803 else
804 return false;
807 return true;
811 /* Function vect_update_misalignment_for_peel
813 DR - the data reference whose misalignment is to be adjusted.
814 DR_PEEL - the data reference whose misalignment is being made
815 zero in the vector loop by the peel.
816 NPEEL - the number of iterations in the peel loop if the misalignment
817 of DR_PEEL is known at compile time. */
819 static void
820 vect_update_misalignment_for_peel (struct data_reference *dr,
821 struct data_reference *dr_peel, int npeel)
823 unsigned int i;
824 vec<dr_p> same_align_drs;
825 struct data_reference *current_dr;
826 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
827 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
828 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
829 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
831 /* For interleaved data accesses the step in the loop must be multiplied by
832 the size of the interleaving group. */
833 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
834 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
835 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
836 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
838 /* It can be assumed that the data refs with the same alignment as dr_peel
839 are aligned in the vector loop. */
840 same_align_drs
841 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
842 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
844 if (current_dr != dr)
845 continue;
846 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
847 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
848 SET_DR_MISALIGNMENT (dr, 0);
849 return;
852 if (known_alignment_for_access_p (dr)
853 && known_alignment_for_access_p (dr_peel))
855 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
856 int misal = DR_MISALIGNMENT (dr);
857 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
858 misal += negative ? -npeel * dr_size : npeel * dr_size;
859 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
860 SET_DR_MISALIGNMENT (dr, misal);
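      /* Worked example (illustrative numbers): with a 16-byte vector
	 alignment, DR_SIZE == 4, a current misalignment of 8 bytes and
	 NPEEL == 2, the update yields (8 + 2 * 4) & 15 == 0, i.e. DR
	 becomes aligned after the peel.  */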
861 return;
864 if (dump_enabled_p ())
865 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
866 SET_DR_MISALIGNMENT (dr, -1);
870 /* Function vect_verify_datarefs_alignment
872 Return TRUE if all data references in the loop can be
873 handled with respect to alignment. */
875 bool
876 vect_verify_datarefs_alignment (vec_info *vinfo)
878 vec<data_reference_p> datarefs = vinfo->datarefs;
879 struct data_reference *dr;
880 enum dr_alignment_support supportable_dr_alignment;
881 unsigned int i;
883 FOR_EACH_VEC_ELT (datarefs, i, dr)
885 gimple *stmt = DR_STMT (dr);
886 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
888 if (!STMT_VINFO_RELEVANT_P (stmt_info))
889 continue;
891 /* For interleaving, only the alignment of the first access matters.
892 Skip statements marked as not vectorizable. */
893 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
894 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
895 || !STMT_VINFO_VECTORIZABLE (stmt_info))
896 continue;
898 /* Strided accesses perform only component accesses, alignment is
899 irrelevant for them. */
900 if (STMT_VINFO_STRIDED_P (stmt_info)
901 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
902 continue;
904 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
905 if (!supportable_dr_alignment)
907 if (dump_enabled_p ())
909 if (DR_IS_READ (dr))
910 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
911 "not vectorized: unsupported unaligned load.");
912 else
913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
914 "not vectorized: unsupported unaligned "
915 "store.");
917 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
918 DR_REF (dr));
919 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
921 return false;
923 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
924 dump_printf_loc (MSG_NOTE, vect_location,
925 "Vectorizing an unaligned access.\n");
927 return true;
930 /* Given a memory reference EXP return whether its alignment is less
931 than its size. */
933 static bool
934 not_size_aligned (tree exp)
936 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
937 return true;
939 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
940 > get_object_alignment (exp));
943 /* Function vector_alignment_reachable_p
945 Return true if vector alignment for DR is reachable by peeling
946 a few loop iterations. Return false otherwise. */
948 static bool
949 vector_alignment_reachable_p (struct data_reference *dr)
951 gimple *stmt = DR_STMT (dr);
952 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
953 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
955 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
957 /* For interleaved accesses we peel only if the number of iterations in
958    the prologue loop (VF - misalignment, counted in elements) is a
959    multiple of the interleaving group size.  */
960 int elem_size, mis_in_elements;
961 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
963 /* FORNOW: handle only known alignment. */
964 if (!known_alignment_for_access_p (dr))
965 return false;
967 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
968 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
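      /* E.g. (illustrative): with 4 elements per vector and a misalignment
	 of 2 elements the prologue would peel 2 scalar iterations, which is
	 acceptable only if the group size divides 2.  */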
970 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
971 return false;
974 /* If the misalignment is known at compile time then allow peeling
975    only if natural alignment is reachable through peeling.  */
976 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
978 HOST_WIDE_INT elmsize =
979 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
980 if (dump_enabled_p ())
982 dump_printf_loc (MSG_NOTE, vect_location,
983 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
984 dump_printf (MSG_NOTE,
985 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
987 if (DR_MISALIGNMENT (dr) % elmsize)
989 if (dump_enabled_p ())
990 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
991 "data size does not divide the misalignment.\n");
992 return false;
996 if (!known_alignment_for_access_p (dr))
998 tree type = TREE_TYPE (DR_REF (dr));
999 bool is_packed = not_size_aligned (DR_REF (dr));
1000 if (dump_enabled_p ())
1001 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1002 "Unknown misalignment, is_packed = %d\n",is_packed);
1003 if ((TYPE_USER_ALIGN (type) && !is_packed)
1004 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1005 return true;
1006 else
1007 return false;
1010 return true;
1014 /* Calculate the cost of the memory access represented by DR. */
1016 static void
1017 vect_get_data_access_cost (struct data_reference *dr,
1018 unsigned int *inside_cost,
1019 unsigned int *outside_cost,
1020 stmt_vector_for_cost *body_cost_vec)
1022 gimple *stmt = DR_STMT (dr);
1023 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1024 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1025 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1026 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1027 int ncopies = vf / nunits;
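  /* E.g. with a vectorization factor of 8 and 4 elements per vector,
     each scalar access is costed as 2 vector accesses (illustrative).  */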
1029 if (DR_IS_READ (dr))
1030 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1031 NULL, body_cost_vec, false);
1032 else
1033 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1035 if (dump_enabled_p ())
1036 dump_printf_loc (MSG_NOTE, vect_location,
1037 "vect_get_data_access_cost: inside_cost = %d, "
1038 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1042 /* Insert DR into peeling hash table with NPEEL as key. */
1044 static void
1045 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1046 int npeel)
1048 struct _vect_peel_info elem, *slot;
1049 _vect_peel_info **new_slot;
1050 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1052 elem.npeel = npeel;
1053 slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find (&elem);
1054 if (slot)
1055 slot->count++;
1056 else
1058 slot = XNEW (struct _vect_peel_info);
1059 slot->npeel = npeel;
1060 slot->dr = dr;
1061 slot->count = 1;
1062 new_slot
1063 = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find_slot (slot, INSERT);
1064 *new_slot = slot;
1067 if (!supportable_dr_alignment
1068 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1069 slot->count += VECT_MAX_COST;
1073 /* Traverse peeling hash table to find peeling option that aligns maximum
1074 number of data accesses. */
1077 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1078 _vect_peel_extended_info *max)
1080 vect_peel_info elem = *slot;
1082 if (elem->count > max->peel_info.count
1083 || (elem->count == max->peel_info.count
1084 && max->peel_info.npeel > elem->npeel))
1086 max->peel_info.npeel = elem->npeel;
1087 max->peel_info.count = elem->count;
1088 max->peel_info.dr = elem->dr;
1091 return 1;
1095 /* Traverse peeling hash table and calculate cost for each peeling option.
1096 Find the one with the lowest cost. */
1099 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1100 _vect_peel_extended_info *min)
1102 vect_peel_info elem = *slot;
1103 int save_misalignment, dummy;
1104 unsigned int inside_cost = 0, outside_cost = 0, i;
1105 gimple *stmt = DR_STMT (elem->dr);
1106 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1107 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1108 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1109 struct data_reference *dr;
1110 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1112 prologue_cost_vec.create (2);
1113 body_cost_vec.create (2);
1114 epilogue_cost_vec.create (2);
1116 FOR_EACH_VEC_ELT (datarefs, i, dr)
1118 stmt = DR_STMT (dr);
1119 stmt_info = vinfo_for_stmt (stmt);
1120 /* For interleaving, only the alignment of the first access
1121 matters. */
1122 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1123 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1124 continue;
1126 save_misalignment = DR_MISALIGNMENT (dr);
1127 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1128 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1129 &body_cost_vec);
1130 SET_DR_MISALIGNMENT (dr, save_misalignment);
1133 outside_cost += vect_get_known_peeling_cost
1134 (loop_vinfo, elem->npeel, &dummy,
1135 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1136 &prologue_cost_vec, &epilogue_cost_vec);
1138 /* Prologue and epilogue costs are added to the target model later.
1139 These costs depend only on the scalar iteration cost, the
1140 number of peeling iterations finally chosen, and the number of
1141 misaligned statements. So discard the information found here. */
1142 prologue_cost_vec.release ();
1143 epilogue_cost_vec.release ();
1145 if (inside_cost < min->inside_cost
1146 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1148 min->inside_cost = inside_cost;
1149 min->outside_cost = outside_cost;
1150 min->body_cost_vec.release ();
1151 min->body_cost_vec = body_cost_vec;
1152 min->peel_info.dr = elem->dr;
1153 min->peel_info.npeel = elem->npeel;
1155 else
1156 body_cost_vec.release ();
1158 return 1;
1162 /* Choose best peeling option by traversing peeling hash table and either
1163 choosing an option with the lowest cost (if cost model is enabled) or the
1164 option that aligns as many accesses as possible. */
1166 static struct data_reference *
1167 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1168 unsigned int *npeel,
1169 stmt_vector_for_cost *body_cost_vec)
1171 struct _vect_peel_extended_info res;
1173 res.peel_info.dr = NULL;
1174 res.body_cost_vec = stmt_vector_for_cost ();
1176 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1178 res.inside_cost = INT_MAX;
1179 res.outside_cost = INT_MAX;
1180 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1181 ->traverse <_vect_peel_extended_info *,
1182 vect_peeling_hash_get_lowest_cost> (&res);
1184 else
1186 res.peel_info.count = 0;
1187 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1188 ->traverse <_vect_peel_extended_info *,
1189 vect_peeling_hash_get_most_frequent> (&res);
1192 *npeel = res.peel_info.npeel;
1193 *body_cost_vec = res.body_cost_vec;
1194 return res.peel_info.dr;
1198 /* Function vect_enhance_data_refs_alignment
1200 This pass will use loop versioning and loop peeling in order to enhance
1201 the alignment of data references in the loop.
1203 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1204 original loop is to be vectorized. Any other loops that are created by
1205 the transformations performed in this pass are not supposed to be
1206 vectorized. This restriction will be relaxed.
1208 This pass will require a cost model to guide it whether to apply peeling
1209 or versioning or a combination of the two.  For example, the scheme that
1210 Intel uses when given a loop with several memory accesses is as follows:
1211 choose one memory access ('p') whose alignment you want to force by doing
1212 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1213 other accesses are not necessarily aligned, or (2) use loop versioning to
1214 generate one loop in which all accesses are aligned, and another loop in
1215 which only 'p' is necessarily aligned.
1217 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1218 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1219 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1221 Devising a cost model is the most critical aspect of this work. It will
1222 guide us on which access to peel for, whether to use loop versioning, how
1223 many versions to create, etc. The cost model will probably consist of
1224 generic considerations as well as target specific considerations (on
1225 powerpc for example, misaligned stores are more painful than misaligned
1226 loads).
1228 Here are the general steps involved in alignment enhancements:
1230 -- original loop, before alignment analysis:
1231 for (i=0; i<N; i++){
1232 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1233 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1236 -- After vect_compute_data_refs_alignment:
1237 for (i=0; i<N; i++){
1238 x = q[i]; # DR_MISALIGNMENT(q) = 3
1239 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1242 -- Possibility 1: we do loop versioning:
1243 if (p is aligned) {
1244 for (i=0; i<N; i++){ # loop 1A
1245 x = q[i]; # DR_MISALIGNMENT(q) = 3
1246 p[i] = y; # DR_MISALIGNMENT(p) = 0
1249 else {
1250 for (i=0; i<N; i++){ # loop 1B
1251 x = q[i]; # DR_MISALIGNMENT(q) = 3
1252 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1256 -- Possibility 2: we do loop peeling:
1257 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1258 x = q[i];
1259 p[i] = y;
1261 for (i = 3; i < N; i++){ # loop 2A
1262 x = q[i]; # DR_MISALIGNMENT(q) = 0
1263 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1266 -- Possibility 3: combination of loop peeling and versioning:
1267 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1268 x = q[i];
1269 p[i] = y;
1271 if (p is aligned) {
1272 for (i = 3; i<N; i++){ # loop 3A
1273 x = q[i]; # DR_MISALIGNMENT(q) = 0
1274 p[i] = y; # DR_MISALIGNMENT(p) = 0
1277 else {
1278 for (i = 3; i<N; i++){ # loop 3B
1279 x = q[i]; # DR_MISALIGNMENT(q) = 0
1280 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1284 These loops are later passed to loop_transform to be vectorized. The
1285 vectorizer will use the alignment information to guide the transformation
1286 (whether to generate regular loads/stores, or with special handling for
1287 misalignment). */
1289 bool
1290 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1292 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1293 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1294 enum dr_alignment_support supportable_dr_alignment;
1295 struct data_reference *dr0 = NULL, *first_store = NULL;
1296 struct data_reference *dr;
1297 unsigned int i, j;
1298 bool do_peeling = false;
1299 bool do_versioning = false;
1300 bool stat;
1301 gimple *stmt;
1302 stmt_vec_info stmt_info;
1303 unsigned int npeel = 0;
1304 bool all_misalignments_unknown = true;
1305 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1306 unsigned possible_npeel_number = 1;
1307 tree vectype;
1308 unsigned int nelements, mis, same_align_drs_max = 0;
1309 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1311 if (dump_enabled_p ())
1312 dump_printf_loc (MSG_NOTE, vect_location,
1313 "=== vect_enhance_data_refs_alignment ===\n");
1315 /* While cost model enhancements are expected in the future, the high level
1316 view of the code at this time is as follows:
1318 A) If there is a misaligned access then see if peeling to align
1319 this access can make all data references satisfy
1320 vect_supportable_dr_alignment. If so, update data structures
1321 as needed and return true.
1323 B) If peeling wasn't possible and there is a data reference with an
1324 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1325 then see if loop versioning checks can be used to make all data
1326 references satisfy vect_supportable_dr_alignment. If so, update
1327 data structures as needed and return true.
1329 C) If neither peeling nor versioning were successful then return false if
1330 any data reference does not satisfy vect_supportable_dr_alignment.
1332 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1334 Note, Possibility 3 above (which is peeling and versioning together) is not
1335 being done at this time. */
1337 /* (1) Peeling to force alignment. */
1339 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1340 Considerations:
1341 + How many accesses will become aligned due to the peeling
1342 - How many accesses will become unaligned due to the peeling,
1343 and the cost of misaligned accesses.
1344 - The cost of peeling (the extra runtime checks, the increase
1345 in code size). */
1347 FOR_EACH_VEC_ELT (datarefs, i, dr)
1349 stmt = DR_STMT (dr);
1350 stmt_info = vinfo_for_stmt (stmt);
1352 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1353 continue;
1355 /* For interleaving, only the alignment of the first access
1356 matters. */
1357 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1358 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1359 continue;
1361 /* For invariant accesses there is nothing to enhance. */
1362 if (integer_zerop (DR_STEP (dr)))
1363 continue;
1365 /* Strided accesses perform only component accesses, alignment is
1366 irrelevant for them. */
1367 if (STMT_VINFO_STRIDED_P (stmt_info)
1368 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1369 continue;
1371 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1372 do_peeling = vector_alignment_reachable_p (dr);
1373 if (do_peeling)
1375 if (known_alignment_for_access_p (dr))
1377 unsigned int npeel_tmp;
1378 bool negative = tree_int_cst_compare (DR_STEP (dr),
1379 size_zero_node) < 0;
1381 /* Save info about DR in the hash table. */
1382 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
1383 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1384 = new hash_table<peel_info_hasher> (1);
1386 vectype = STMT_VINFO_VECTYPE (stmt_info);
1387 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1388 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1389 TREE_TYPE (DR_REF (dr))));
1390 npeel_tmp = (negative
1391 ? (mis - nelements) : (nelements - mis))
1392 & (nelements - 1);
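              /* E.g. (illustrative): for a forward access with 4 elements
                 per vector and a misalignment of 3 elements, npeel_tmp is
                 (4 - 3) & 3 == 1, i.e. one peeled scalar iteration would
                 align this reference.  */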
1394 /* For multiple types, it is possible that the bigger type access
1395 will have more than one peeling option. E.g., a loop with two
1396 types: one of size (vector size / 4), and the other one of
1397 size (vector size / 8).  The vectorization factor will be 8.  If both
1398 access are misaligned by 3, the first one needs one scalar
1399 iteration to be aligned, and the second one needs 5.  But the
1400 first one will also be aligned by peeling 5 scalar
1401 iterations, and in that case both accesses will be aligned.
1402 Hence, except for the immediate peeling amount, we also want
1403 to try to add full vector size, while we don't exceed
1404 vectorization factor.
1405 We do this automatically for the cost model, since we calculate cost
1406 for every peeling option. */
1407 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1409 if (STMT_SLP_TYPE (stmt_info))
1410 possible_npeel_number
1411 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1412 else
1413 possible_npeel_number = vf / nelements;
1416 /* Handle the aligned case. We may decide to align some other
1417 access, making DR unaligned. */
1418 if (DR_MISALIGNMENT (dr) == 0)
1420 npeel_tmp = 0;
1421 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1422 possible_npeel_number++;
1425 for (j = 0; j < possible_npeel_number; j++)
1427 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1428 npeel_tmp += nelements;
1431 all_misalignments_unknown = false;
1432 /* Data-ref that was chosen for the case that all the
1433 misalignments are unknown is not relevant anymore, since we
1434 have a data-ref with known alignment. */
1435 dr0 = NULL;
1437 else
1439 /* If we don't know any misalignment values, we prefer
1440    peeling for the data-ref that has the maximum number of data-refs
1441    with the same alignment, unless the target prefers to align
1442    stores over loads.  */
1443 if (all_misalignments_unknown)
1445 unsigned same_align_drs
1446 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1447 if (!dr0
1448 || same_align_drs_max < same_align_drs)
1450 same_align_drs_max = same_align_drs;
1451 dr0 = dr;
1453 /* For data-refs with the same number of related
1454 accesses prefer the one where the misalign
1455 computation will be invariant in the outermost loop. */
1456 else if (same_align_drs_max == same_align_drs)
1458 struct loop *ivloop0, *ivloop;
1459 ivloop0 = outermost_invariant_loop_for_expr
1460 (loop, DR_BASE_ADDRESS (dr0));
1461 ivloop = outermost_invariant_loop_for_expr
1462 (loop, DR_BASE_ADDRESS (dr));
1463 if ((ivloop && !ivloop0)
1464 || (ivloop && ivloop0
1465 && flow_loop_nested_p (ivloop, ivloop0)))
1466 dr0 = dr;
1469 if (!first_store && DR_IS_WRITE (dr))
1470 first_store = dr;
1473 /* If there are both known and unknown misaligned accesses in the
1474 loop, we choose peeling amount according to the known
1475 accesses. */
1476 if (!supportable_dr_alignment)
1478 dr0 = dr;
1479 if (!first_store && DR_IS_WRITE (dr))
1480 first_store = dr;
1484 else
1486 if (!aligned_access_p (dr))
1488 if (dump_enabled_p ())
1489 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1490 "vector alignment may not be reachable\n");
1491 break;
1496 /* Check if we can possibly peel the loop. */
1497 if (!vect_can_advance_ivs_p (loop_vinfo)
1498 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1499 || loop->inner)
1500 do_peeling = false;
1502 if (do_peeling
1503 && all_misalignments_unknown
1504 && vect_supportable_dr_alignment (dr0, false))
1506 /* Check if the target requires to prefer stores over loads, i.e., if
1507 misaligned stores are more expensive than misaligned loads (taking
1508 drs with same alignment into account). */
1509 if (first_store && DR_IS_READ (dr0))
1511 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1512 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1513 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1514 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1515 stmt_vector_for_cost dummy;
1516 dummy.create (2);
1518 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1519 &dummy);
1520 vect_get_data_access_cost (first_store, &store_inside_cost,
1521 &store_outside_cost, &dummy);
1523 dummy.release ();
1525 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1526 aligning the load DR0). */
1527 load_inside_penalty = store_inside_cost;
1528 load_outside_penalty = store_outside_cost;
1529 for (i = 0;
1530 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1531 DR_STMT (first_store))).iterate (i, &dr);
1532 i++)
1533 if (DR_IS_READ (dr))
1535 load_inside_penalty += load_inside_cost;
1536 load_outside_penalty += load_outside_cost;
1538 else
1540 load_inside_penalty += store_inside_cost;
1541 load_outside_penalty += store_outside_cost;
1544 /* Calculate the penalty for leaving DR0 unaligned (by
1545 aligning the FIRST_STORE). */
1546 store_inside_penalty = load_inside_cost;
1547 store_outside_penalty = load_outside_cost;
1548 for (i = 0;
1549 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1550 DR_STMT (dr0))).iterate (i, &dr);
1551 i++)
1552 if (DR_IS_READ (dr))
1554 store_inside_penalty += load_inside_cost;
1555 store_outside_penalty += load_outside_cost;
1557 else
1559 store_inside_penalty += store_inside_cost;
1560 store_outside_penalty += store_outside_cost;
1563 if (load_inside_penalty > store_inside_penalty
1564 || (load_inside_penalty == store_inside_penalty
1565 && load_outside_penalty > store_outside_penalty))
1566 dr0 = first_store;
1569 /* In case there are only loads with different unknown misalignments, use
1570 peeling only if it may help to align other accesses in the loop or
1571 if it may help improve load bandwidth when we'd end up using
1572 unaligned loads. */
1573 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1574 if (!first_store
1575 && !STMT_VINFO_SAME_ALIGN_REFS (
1576 vinfo_for_stmt (DR_STMT (dr0))).length ()
1577 && (vect_supportable_dr_alignment (dr0, false)
1578 != dr_unaligned_supported
1579 || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1580 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1581 do_peeling = false;
1584 if (do_peeling && !dr0)
1586 /* Peeling is possible, but no data access strictly requires alignment
1587    in order to be supported.  So we try to choose the best possible peeling.  */
1589 /* We should get here only if there are drs with known misalignment. */
1590 gcc_assert (!all_misalignments_unknown);
1592 /* Choose the best peeling from the hash table. */
1593 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1594 &body_cost_vec);
1595 if (!dr0 || !npeel)
1596 do_peeling = false;
1599 if (do_peeling)
1601 stmt = DR_STMT (dr0);
1602 stmt_info = vinfo_for_stmt (stmt);
1603 vectype = STMT_VINFO_VECTYPE (stmt_info);
1604 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1606 if (known_alignment_for_access_p (dr0))
1608 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1609 size_zero_node) < 0;
1610 if (!npeel)
1612 /* Since it's known at compile time, compute the number of
1613 iterations in the peeled loop (the peeling factor) for use in
1614 updating DR_MISALIGNMENT values. The peeling factor is the
1615 vectorization factor minus the misalignment as an element
1616 count. */
1617 mis = DR_MISALIGNMENT (dr0);
1618 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1619 npeel = ((negative ? mis - nelements : nelements - mis)
1620 & (nelements - 1));
1623 /* For interleaved data access every iteration accesses all the
1624 members of the group, therefore we divide the number of iterations
1625 by the group size. */
1626 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1627 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1628 npeel /= GROUP_SIZE (stmt_info);
1630 if (dump_enabled_p ())
1631 dump_printf_loc (MSG_NOTE, vect_location,
1632 "Try peeling by %d\n", npeel);
1635 /* Ensure that all data refs can be vectorized after the peel. */
1636 FOR_EACH_VEC_ELT (datarefs, i, dr)
1638 int save_misalignment;
1640 if (dr == dr0)
1641 continue;
1643 stmt = DR_STMT (dr);
1644 stmt_info = vinfo_for_stmt (stmt);
1645 /* For interleaving, only the alignment of the first access
1646 matters. */
1647 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1648 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1649 continue;
1651 /* Strided accesses perform only component accesses, alignment is
1652 irrelevant for them. */
1653 if (STMT_VINFO_STRIDED_P (stmt_info)
1654 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1655 continue;
1657 save_misalignment = DR_MISALIGNMENT (dr);
1658 vect_update_misalignment_for_peel (dr, dr0, npeel);
1659 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1660 SET_DR_MISALIGNMENT (dr, save_misalignment);
1662 if (!supportable_dr_alignment)
1664 do_peeling = false;
1665 break;
1669 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1671 stat = vect_verify_datarefs_alignment (loop_vinfo);
1672 if (!stat)
1673 do_peeling = false;
1674 else
1676 body_cost_vec.release ();
1677 return stat;
1681 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1682 if (do_peeling)
1684 unsigned max_allowed_peel
1685 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1686 if (max_allowed_peel != (unsigned)-1)
1688 unsigned max_peel = npeel;
1689 if (max_peel == 0)
1691 gimple *dr_stmt = DR_STMT (dr0);
1692 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1693 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1694 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1696 if (max_peel > max_allowed_peel)
1698 do_peeling = false;
1699 if (dump_enabled_p ())
1700 dump_printf_loc (MSG_NOTE, vect_location,
1701 "Disable peeling, max peels reached: %d\n", max_peel);
1706 /* Cost model #2 - if peeling may result in a remaining loop not
1707 iterating enough to be vectorized then do not peel. */
1708 if (do_peeling
1709 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1711 unsigned max_peel
1712 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
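      /* E.g. (illustrative): with 6 known iterations, a vectorization
	 factor of 4 and up to 3 peeled iterations, fewer than 4 iterations
	 could remain for the vector loop, so the check below disables
	 peeling.  */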
1713 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1714 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1715 do_peeling = false;
1718 if (do_peeling)
1720 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1721 If the misalignment of DR_i is identical to that of dr0 then set
1722 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1723 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1724 by the peeling factor times the element size of DR_i (MOD the
1725 vectorization factor times the size). Otherwise, the
1726 misalignment of DR_i must be set to unknown. */
1727 FOR_EACH_VEC_ELT (datarefs, i, dr)
1728 if (dr != dr0)
1729 vect_update_misalignment_for_peel (dr, dr0, npeel);
1731 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1732 if (npeel)
1733 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1734 else
1735 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1736 = DR_MISALIGNMENT (dr0);
1737 SET_DR_MISALIGNMENT (dr0, 0);
1738 if (dump_enabled_p ())
1740 dump_printf_loc (MSG_NOTE, vect_location,
1741 "Alignment of access forced using peeling.\n");
1742 dump_printf_loc (MSG_NOTE, vect_location,
1743 "Peeling for alignment will be applied.\n");
1745 /* The inside-loop cost will be accounted for in vectorizable_load
1746 and vectorizable_store correctly with adjusted alignments.
1747 Drop the body_cost_vec on the floor here.  */
1748 body_cost_vec.release ();
1750 stat = vect_verify_datarefs_alignment (loop_vinfo);
1751 gcc_assert (stat);
1752 return stat;
1756 body_cost_vec.release ();
1758 /* (2) Versioning to force alignment. */
1760 /* Try versioning if:
1761 1) optimize loop for speed
1762 2) there is at least one unsupported misaligned data ref with an unknown
1763 misalignment, and
1764 3) all misaligned data refs with a known misalignment are supported, and
1765 4) the number of runtime alignment checks is within reason. */
1767 do_versioning =
1768 optimize_loop_nest_for_speed_p (loop)
1769 && (!loop->inner); /* FORNOW */
1771 if (do_versioning)
1773 FOR_EACH_VEC_ELT (datarefs, i, dr)
1775 stmt = DR_STMT (dr);
1776 stmt_info = vinfo_for_stmt (stmt);
1778 /* For interleaving, only the alignment of the first access
1779 matters. */
1780 if (aligned_access_p (dr)
1781 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1782 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1783 continue;
1785 if (STMT_VINFO_STRIDED_P (stmt_info))
1787 /* Strided loads perform only component accesses, alignment is
1788 irrelevant for them. */
1789 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1790 continue;
1791 do_versioning = false;
1792 break;
1795 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1797 if (!supportable_dr_alignment)
1799 gimple *stmt;
1800 int mask;
1801 tree vectype;
1803 if (known_alignment_for_access_p (dr)
1804 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1805 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1807 do_versioning = false;
1808 break;
1811 stmt = DR_STMT (dr);
1812 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1813 gcc_assert (vectype);
1815 /* The rightmost bits of an aligned address must be zeros.
1816 Construct the mask needed for this test. For example,
1817 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1818 mask must be 15 = 0xf. */
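          /* For example, with V4SI the runtime test is (addr & 15) == 0:
             an address of 0x1008 fails since 0x1008 & 0xf == 8, whereas
             0x1010 passes.  The vectorized loop version is entered only if
             the test succeeds for all recorded data references.  */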
1819 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1821 /* FORNOW: use the same mask to test all potentially unaligned
1822 references in the loop. The vectorizer currently supports
1823 a single vector size, see the reference to
1824 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1825 vectorization factor is computed. */
1826 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1827 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1828 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1829 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1830 DR_STMT (dr));
1834 /* Versioning requires at least one misaligned data reference. */
1835 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1836 do_versioning = false;
1837 else if (!do_versioning)
1838 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1841 if (do_versioning)
1843 vec<gimple *> may_misalign_stmts
1844 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1845 gimple *stmt;
1847 /* It can now be assumed that the data references in the statements
1848 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1849 of the loop being vectorized. */
1850 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1852 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1853 dr = STMT_VINFO_DATA_REF (stmt_info);
1854 SET_DR_MISALIGNMENT (dr, 0);
1855 if (dump_enabled_p ())
1856 dump_printf_loc (MSG_NOTE, vect_location,
1857 "Alignment of access forced using versioning.\n");
1860 if (dump_enabled_p ())
1861 dump_printf_loc (MSG_NOTE, vect_location,
1862 "Versioning for alignment will be applied.\n");
1864 /* Peeling and versioning can't be done together at this time. */
1865 gcc_assert (! (do_peeling && do_versioning));
1867 stat = vect_verify_datarefs_alignment (loop_vinfo);
1868 gcc_assert (stat);
1869 return stat;
1872 /* This point is reached if neither peeling nor versioning is being done. */
1873 gcc_assert (! (do_peeling || do_versioning));
1875 stat = vect_verify_datarefs_alignment (loop_vinfo);
1876 return stat;
1880 /* Function vect_find_same_alignment_drs.
1882 Update group and alignment relations according to the chosen
1883 vectorization factor. */
1885 static void
1886 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1887 loop_vec_info loop_vinfo)
1889 unsigned int i;
1890 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1891 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1892 struct data_reference *dra = DDR_A (ddr);
1893 struct data_reference *drb = DDR_B (ddr);
1894 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1895 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1896 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1897 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1898 lambda_vector dist_v;
1899 unsigned int loop_depth;
1901 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1902 return;
1904 if (dra == drb)
1905 return;
1907 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1908 return;
1910 /* Loop-based vectorization and known data dependence. */
1911 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1912 return;
1914 /* Data-dependence analysis reports a distance vector of zero
1915 for data-references that overlap only in the first iteration
1916 but whose steps have different signs (see PR45764).
1917 So as a sanity check, require equal DR_STEP. */
1918 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1919 return;
1921 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1922 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1924 int dist = dist_v[loop_depth];
1926 if (dump_enabled_p ())
1927 dump_printf_loc (MSG_NOTE, vect_location,
1928 "dependence distance = %d.\n", dist);
1930 /* Same loop iteration. */
1931 if (dist == 0
1932 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1934 /* Two references with distance zero have the same alignment. */
1935 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1936 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1937 if (dump_enabled_p ())
1939 dump_printf_loc (MSG_NOTE, vect_location,
1940 "accesses have the same alignment.\n");
1941 dump_printf (MSG_NOTE,
1942 "dependence distance modulo vf == 0 between ");
1943 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1944 dump_printf (MSG_NOTE, " and ");
1945 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1946 dump_printf (MSG_NOTE, "\n");
1953 /* Function vect_analyze_data_refs_alignment
1955 Analyze the alignment of the data-references in the loop.
1956 Return FALSE if a data reference is found that cannot be vectorized. */
1958 bool
1959 vect_analyze_data_refs_alignment (vec_info *vinfo)
1961 if (dump_enabled_p ())
1962 dump_printf_loc (MSG_NOTE, vect_location,
1963 "=== vect_analyze_data_refs_alignment ===\n");
1965 /* Mark groups of data references with the same alignment using
1966 data dependence information. */
1967 if (is_a <loop_vec_info> (vinfo))
1969 vec<ddr_p> ddrs = vinfo->ddrs;
1970 struct data_dependence_relation *ddr;
1971 unsigned int i;
1973 FOR_EACH_VEC_ELT (ddrs, i, ddr)
1974 vect_find_same_alignment_drs (ddr, as_a <loop_vec_info> (vinfo));
1977 if (!vect_compute_data_refs_alignment (vinfo))
1979 if (dump_enabled_p ())
1980 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1981 "not vectorized: can't calculate alignment "
1982 "for data ref.\n");
1983 return false;
1986 return true;
1990 /* Analyze groups of accesses: check that DR belongs to a group of
1991 accesses of legal size, step, etc. Detect gaps, single element
1992 interleaving, and other special cases. Set grouped access info.
1993 Collect groups of strided stores for further use in SLP analysis.
1994 Worker for vect_analyze_group_access. */
1996 static bool
1997 vect_analyze_group_access_1 (struct data_reference *dr)
1999 tree step = DR_STEP (dr);
2000 tree scalar_type = TREE_TYPE (DR_REF (dr));
2001 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2002 gimple *stmt = DR_STMT (dr);
2003 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2004 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2005 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2006 HOST_WIDE_INT dr_step = -1;
2007 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2008 bool slp_impossible = false;
2009 struct loop *loop = NULL;
2011 if (loop_vinfo)
2012 loop = LOOP_VINFO_LOOP (loop_vinfo);
2014 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2015 size of the interleaving group (including gaps). */
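  /* E.g. for 4-byte elements accessed as a[4*i], a[4*i+1] and a[4*i+2],
     DR_STEP is 16 bytes and GROUPSIZE is 16 / 4 = 4, even though only
     three elements are read each iteration; the unused slot is a gap.  */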
2016 if (tree_fits_shwi_p (step))
2018 dr_step = tree_to_shwi (step);
2019 groupsize = absu_hwi (dr_step) / type_size;
2021 else
2022 groupsize = 0;
2024 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2025 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2027 /* Check if this DR is part of an interleaving group, and is a single
2028 element of the group that is accessed in the loop. */
2030 /* Gaps are supported only for loads. STEP must be a multiple of the type
2031 size. The size of the group must be a power of 2. */
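      /* For example, a loop that reads only a[4*i] forms a single element
         interleaving group: GROUPSIZE is 4 with three elements of gap,
         which is acceptable for loads because the group size is a power
         of two.  */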
2032 if (DR_IS_READ (dr)
2033 && (dr_step % type_size) == 0
2034 && groupsize > 0
2035 && exact_log2 (groupsize) != -1)
2037 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2038 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2039 if (dump_enabled_p ())
2041 dump_printf_loc (MSG_NOTE, vect_location,
2042 "Detected single element interleaving ");
2043 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2044 dump_printf (MSG_NOTE, " step ");
2045 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2046 dump_printf (MSG_NOTE, "\n");
2049 if (loop_vinfo)
2051 if (dump_enabled_p ())
2052 dump_printf_loc (MSG_NOTE, vect_location,
2053 "Data access with gaps requires scalar "
2054 "epilogue loop\n");
2055 if (loop->inner)
2057 if (dump_enabled_p ())
2058 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2059 "Peeling for outer loop is not"
2060 " supported\n");
2061 return false;
2064 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2067 return true;
2070 if (dump_enabled_p ())
2072 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2073 "not consecutive access ");
2074 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2075 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2078 if (bb_vinfo)
2080 /* Mark the statement as unvectorizable. */
2081 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2082 return true;
2085 return false;
2088 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2090 /* First stmt in the interleaving chain. Check the chain. */
2091 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2092 struct data_reference *data_ref = dr;
2093 unsigned int count = 1;
2094 tree prev_init = DR_INIT (data_ref);
2095 gimple *prev = stmt;
2096 HOST_WIDE_INT diff, gaps = 0;
2098 while (next)
2100 /* Skip same data-refs. In case two or more stmts share a data-ref
2101 (supported only for loads), we vectorize only the first
2102 stmt, and the rest get their vectorized loads from the first
2103 one. */
2104 if (!tree_int_cst_compare (DR_INIT (data_ref),
2105 DR_INIT (STMT_VINFO_DATA_REF (
2106 vinfo_for_stmt (next)))))
2108 if (DR_IS_WRITE (data_ref))
2110 if (dump_enabled_p ())
2111 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2112 "Two store stmts share the same dr.\n");
2113 return false;
2116 /* For loads, use the same data-ref load. */
2117 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2119 prev = next;
2120 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2121 continue;
2124 prev = next;
2125 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2127 /* All group members have the same STEP by construction. */
2128 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2130 /* Check that the distance between two accesses is equal to the type
2131 size. Otherwise, we have gaps. */
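          /* E.g. for 4-byte elements with DR_INITs 0 and 8 the computed
             DIFF is (8 - 0) / 4 = 2, i.e. there is one element of gap
             between the two group members.  */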
2132 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2133 - TREE_INT_CST_LOW (prev_init)) / type_size;
2134 if (diff != 1)
2136 /* FORNOW: SLP of accesses with gaps is not supported. */
2137 slp_impossible = true;
2138 if (DR_IS_WRITE (data_ref))
2140 if (dump_enabled_p ())
2141 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2142 "interleaved store with gaps\n");
2143 return false;
2146 gaps += diff - 1;
2149 last_accessed_element += diff;
2151 /* Store the gap from the previous member of the group. If there is no
2152 gap in the access, GROUP_GAP is always 1. */
2153 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2155 prev_init = DR_INIT (data_ref);
2156 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2157 /* Count the number of data-refs in the chain. */
2158 count++;
2161 if (groupsize == 0)
2162 groupsize = count + gaps;
2164 if (groupsize > UINT_MAX)
2166 if (dump_enabled_p ())
2167 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2168 "group is too large\n");
2169 return false;
2172 /* Check that the size of the interleaving is equal to count for stores,
2173 i.e., that there are no gaps. */
2174 if (groupsize != count
2175 && !DR_IS_READ (dr))
2177 if (dump_enabled_p ())
2178 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2179 "interleaved store with gaps\n");
2180 return false;
2183 /* If there is a gap after the last load in the group it is the
2184 difference between the groupsize and the last accessed
2185 element.
2186 When there is no gap, this difference should be 0. */
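  /* For instance, with GROUPSIZE 4 and members at element offsets 0 and 1,
     LAST_ACCESSED_ELEMENT is 2 and the gap recorded after the group is
     4 - 2 = 2 elements.  */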
2187 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2189 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2190 if (dump_enabled_p ())
2192 dump_printf_loc (MSG_NOTE, vect_location,
2193 "Detected interleaving ");
2194 if (DR_IS_READ (dr))
2195 dump_printf (MSG_NOTE, "load ");
2196 else
2197 dump_printf (MSG_NOTE, "store ");
2198 dump_printf (MSG_NOTE, "of size %u starting with ",
2199 (unsigned)groupsize);
2200 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2201 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2202 dump_printf_loc (MSG_NOTE, vect_location,
2203 "There is a gap of %u elements after the group\n",
2204 GROUP_GAP (vinfo_for_stmt (stmt)));
2207 /* SLP: create an SLP data structure for every interleaving group of
2208 stores for further analysis in vect_analyze_slp. */
2209 if (DR_IS_WRITE (dr) && !slp_impossible)
2211 if (loop_vinfo)
2212 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2213 if (bb_vinfo)
2214 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2217 /* If there is a gap at the end of the group or the group size cannot
2218 be made a multiple of the vector element count then we access excess
2219 elements in the last iteration and thus need to peel that off. */
2220 if (loop_vinfo
2221 && (groupsize - last_accessed_element > 0
2222 || exact_log2 (groupsize) == -1))
2225 if (dump_enabled_p ())
2226 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2227 "Data access with gaps requires scalar "
2228 "epilogue loop\n");
2229 if (loop->inner)
2231 if (dump_enabled_p ())
2232 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2233 "Peeling for outer loop is not supported\n");
2234 return false;
2237 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2241 return true;
2244 /* Analyze groups of accesses: check that DR belongs to a group of
2245 accesses of legal size, step, etc. Detect gaps, single element
2246 interleaving, and other special cases. Set grouped access info.
2247 Collect groups of strided stores for further use in SLP analysis. */
2249 static bool
2250 vect_analyze_group_access (struct data_reference *dr)
2252 if (!vect_analyze_group_access_1 (dr))
2254 /* Dissolve the group if present. */
2255 gimple *next;
2256 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2257 while (stmt)
2259 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2260 next = GROUP_NEXT_ELEMENT (vinfo);
2261 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2262 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2263 stmt = next;
2265 return false;
2267 return true;
2270 /* Analyze the access pattern of the data-reference DR.
2271 In case of non-consecutive accesses call vect_analyze_group_access() to
2272 analyze groups of accesses. */
2274 static bool
2275 vect_analyze_data_ref_access (struct data_reference *dr)
2277 tree step = DR_STEP (dr);
2278 tree scalar_type = TREE_TYPE (DR_REF (dr));
2279 gimple *stmt = DR_STMT (dr);
2280 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2281 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2282 struct loop *loop = NULL;
2284 if (loop_vinfo)
2285 loop = LOOP_VINFO_LOOP (loop_vinfo);
2287 if (loop_vinfo && !step)
2289 if (dump_enabled_p ())
2290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2291 "bad data-ref access in loop\n");
2292 return false;
2295 /* Allow loads with zero step in inner-loop vectorization. */
2296 if (loop_vinfo && integer_zerop (step))
2298 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2299 if (!nested_in_vect_loop_p (loop, stmt))
2300 return DR_IS_READ (dr);
2301 /* Allow references with zero step for outer loops marked
2302 with pragma omp simd only - it guarantees absence of
2303 loop-carried dependencies between inner loop iterations. */
2304 if (!loop->force_vectorize)
2306 if (dump_enabled_p ())
2307 dump_printf_loc (MSG_NOTE, vect_location,
2308 "zero step in inner loop of nest\n");
2309 return false;
2313 if (loop && nested_in_vect_loop_p (loop, stmt))
2315 /* Interleaved accesses are not yet supported within outer-loop
2316 vectorization for references in the inner-loop. */
2317 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2319 /* For the rest of the analysis we use the outer-loop step. */
2320 step = STMT_VINFO_DR_STEP (stmt_info);
2321 if (integer_zerop (step))
2323 if (dump_enabled_p ())
2324 dump_printf_loc (MSG_NOTE, vect_location,
2325 "zero step in outer loop.\n");
2326 return DR_IS_READ (dr);
2330 /* Consecutive? */
2331 if (TREE_CODE (step) == INTEGER_CST)
2333 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2334 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2335 || (dr_step < 0
2336 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2338 /* Mark that it is not interleaving. */
2339 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2340 return true;
2344 if (loop && nested_in_vect_loop_p (loop, stmt))
2346 if (dump_enabled_p ())
2347 dump_printf_loc (MSG_NOTE, vect_location,
2348 "grouped access in outer loop.\n");
2349 return false;
2353 /* Assume this is a DR handled by non-constant strided load case. */
2354 if (TREE_CODE (step) != INTEGER_CST)
2355 return (STMT_VINFO_STRIDED_P (stmt_info)
2356 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2357 || vect_analyze_group_access (dr)));
2359 /* Not consecutive access - check if it's a part of interleaving group. */
2360 return vect_analyze_group_access (dr);
2365 /* A helper function used in the comparator function to sort data
2366 references. T1 and T2 are two data references to be compared.
2367 The function returns -1, 0, or 1. */
2369 static int
2370 compare_tree (tree t1, tree t2)
2372 int i, cmp;
2373 enum tree_code code;
2374 char tclass;
2376 if (t1 == t2)
2377 return 0;
2378 if (t1 == NULL)
2379 return -1;
2380 if (t2 == NULL)
2381 return 1;
2384 if (TREE_CODE (t1) != TREE_CODE (t2))
2385 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2387 code = TREE_CODE (t1);
2388 switch (code)
2390 /* For const values, we can just use hash values for comparisons. */
2391 case INTEGER_CST:
2392 case REAL_CST:
2393 case FIXED_CST:
2394 case STRING_CST:
2395 case COMPLEX_CST:
2396 case VECTOR_CST:
2398 hashval_t h1 = iterative_hash_expr (t1, 0);
2399 hashval_t h2 = iterative_hash_expr (t2, 0);
2400 if (h1 != h2)
2401 return h1 < h2 ? -1 : 1;
2402 break;
2405 case SSA_NAME:
2406 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2407 if (cmp != 0)
2408 return cmp;
2410 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2411 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2412 break;
2414 default:
2415 tclass = TREE_CODE_CLASS (code);
2417 /* For declarations, we can just compare their UIDs. */
2418 if (tclass == tcc_declaration)
2420 if (DECL_UID (t1) != DECL_UID (t2))
2421 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2422 break;
2425 /* For expressions with operands, compare their operands recursively. */
2426 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2428 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2429 if (cmp != 0)
2430 return cmp;
2434 return 0;
2438 /* Compare two data-references DRA and DRB so that they can be sorted
2439 into chunks suitable for grouping. */
2441 static int
2442 dr_group_sort_cmp (const void *dra_, const void *drb_)
2444 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2445 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2446 int cmp;
2448 /* Stabilize sort. */
2449 if (dra == drb)
2450 return 0;
2452 /* Ordering of DRs according to base. */
2453 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2455 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2456 if (cmp != 0)
2457 return cmp;
2460 /* And according to DR_OFFSET. */
2461 if (!dr_equal_offsets_p (dra, drb))
2463 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2464 if (cmp != 0)
2465 return cmp;
2468 /* Put reads before writes. */
2469 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2470 return DR_IS_READ (dra) ? -1 : 1;
2472 /* Then sort after access size. */
2473 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2474 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2476 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2477 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2478 if (cmp != 0)
2479 return cmp;
2482 /* And after step. */
2483 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2485 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2486 if (cmp != 0)
2487 return cmp;
2490 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2491 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2492 if (cmp == 0)
2493 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2494 return cmp;
2497 /* Function vect_analyze_data_ref_accesses.
2499 Analyze the access pattern of all the data references in the loop.
2501 FORNOW: the only access pattern that is considered vectorizable is a
2502 simple step 1 (consecutive) access.
2504 FORNOW: handle only arrays and pointer accesses. */
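/* For example, in

     for (i = 0; i < n; i++)
       q[i] = p[2*i] + p[2*i+1];

   the store to q[i] is a consecutive (step 1) access, while the two loads
   from p form an interleaving group of size 2.  */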
2506 bool
2507 vect_analyze_data_ref_accesses (vec_info *vinfo)
2509 unsigned int i;
2510 vec<data_reference_p> datarefs = vinfo->datarefs;
2511 struct data_reference *dr;
2513 if (dump_enabled_p ())
2514 dump_printf_loc (MSG_NOTE, vect_location,
2515 "=== vect_analyze_data_ref_accesses ===\n");
2517 if (datarefs.is_empty ())
2518 return true;
2520 /* Sort the array of datarefs to make building the interleaving chains
2521 linear. Don't modify the original vector's order, it is needed for
2522 determining what dependencies are reversed. */
2523 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2524 datarefs_copy.qsort (dr_group_sort_cmp);
2526 /* Build the interleaving chains. */
2527 for (i = 0; i < datarefs_copy.length () - 1;)
2529 data_reference_p dra = datarefs_copy[i];
2530 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2531 stmt_vec_info lastinfo = NULL;
2532 for (i = i + 1; i < datarefs_copy.length (); ++i)
2534 data_reference_p drb = datarefs_copy[i];
2535 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2537 /* ??? Imperfect sorting (non-compatible types, non-modulo
2538 accesses, same accesses) can lead to a group being artificially
2539 split here as we don't just skip over those. If it really
2540 matters we can push those to a worklist and re-iterate
2541 over them. Then we can just skip ahead to the next DR here. */
2543 /* Check that the data-refs have the same first location (except init)
2544 and they are both either store or load (not load and store,
2545 not masked loads or stores). */
2546 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2547 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2548 DR_BASE_ADDRESS (drb), 0)
2549 || !dr_equal_offsets_p (dra, drb)
2550 || !gimple_assign_single_p (DR_STMT (dra))
2551 || !gimple_assign_single_p (DR_STMT (drb)))
2552 break;
2554 /* Check that the data-refs have the same constant size. */
2555 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2556 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2557 if (!tree_fits_uhwi_p (sza)
2558 || !tree_fits_uhwi_p (szb)
2559 || !tree_int_cst_equal (sza, szb))
2560 break;
2562 /* Check that the data-refs have the same step. */
2563 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2564 break;
2566 /* Do not place the same access in the interleaving chain twice. */
2567 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2568 break;
2570 /* Check the types are compatible.
2571 ??? We don't distinguish this during sorting. */
2572 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2573 TREE_TYPE (DR_REF (drb))))
2574 break;
2576 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2577 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2578 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2579 gcc_assert (init_a < init_b);
2581 /* If init_b == init_a + the size of the type * k, we have an
2582 interleaving, and DRA is accessed before DRB. */
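          /* E.g. two 4-byte accesses with DR_INITs 0 and 8 satisfy this
             with k = 2, whereas DR_INITs 0 and 6 would not and the chain
             is split at that point.  */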
2583 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2584 if ((init_b - init_a) % type_size_a != 0)
2585 break;
2587 /* If we have a store, the accesses must be adjacent. This splits
2588 groups into chunks we support (we don't support vectorization
2589 of stores with gaps). */
2590 if (!DR_IS_READ (dra)
2591 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2592 (DR_INIT (datarefs_copy[i-1]))
2593 != type_size_a))
2594 break;
2596 /* If the step (when constant and non-zero) is not greater than the
2597 difference between the data-refs' inits, the accesses belong to
2598 different groups, so this check splits groups into suitable sizes. */
2599 if (tree_fits_shwi_p (DR_STEP (dra)))
2601 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2602 if (step != 0 && step <= (init_b - init_a))
2603 break;
2606 if (dump_enabled_p ())
2608 dump_printf_loc (MSG_NOTE, vect_location,
2609 "Detected interleaving ");
2610 if (DR_IS_READ (dra))
2611 dump_printf (MSG_NOTE, "load ");
2612 else
2613 dump_printf (MSG_NOTE, "store ");
2614 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2615 dump_printf (MSG_NOTE, " and ");
2616 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2617 dump_printf (MSG_NOTE, "\n");
2620 /* Link the found element into the group list. */
2621 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2623 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2624 lastinfo = stmtinfo_a;
2626 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2627 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2628 lastinfo = stmtinfo_b;
2632 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2633 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2634 && !vect_analyze_data_ref_access (dr))
2636 if (dump_enabled_p ())
2637 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2638 "not vectorized: complicated access pattern.\n");
2640 if (is_a <bb_vec_info> (vinfo))
2642 /* Mark the statement as not vectorizable. */
2643 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2644 continue;
2646 else
2648 datarefs_copy.release ();
2649 return false;
2653 datarefs_copy.release ();
2654 return true;
2658 /* Operator == between two dr_with_seg_len objects.
2660 This equality operator is used to make sure two data refs
2661 are the same one so that we will consider to combine the
2662 aliasing checks of those two pairs of data dependent data
2663 refs. */
2665 static bool
2666 operator == (const dr_with_seg_len& d1,
2667 const dr_with_seg_len& d2)
2669 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2670 DR_BASE_ADDRESS (d2.dr), 0)
2671 && compare_tree (d1.offset, d2.offset) == 0
2672 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2675 /* Function comp_dr_with_seg_len_pair.
2677 Comparison function for sorting objects of dr_with_seg_len_pair_t
2678 so that we can combine aliasing checks in one scan. */
2680 static int
2681 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2683 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2684 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2686 const dr_with_seg_len &p11 = p1->first,
2687 &p12 = p1->second,
2688 &p21 = p2->first,
2689 &p22 = p2->second;
2691 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2692 if a and c have the same base address and step, and b and d have the same
2693 address and step. Therefore, if either a&c or b&d don't have the same address
2694 and step, we don't care about the order of those two pairs after sorting. */
2695 int comp_res;
2697 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2698 DR_BASE_ADDRESS (p21.dr))) != 0)
2699 return comp_res;
2700 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2701 DR_BASE_ADDRESS (p22.dr))) != 0)
2702 return comp_res;
2703 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2704 return comp_res;
2705 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2706 return comp_res;
2707 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2708 return comp_res;
2709 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2710 return comp_res;
2712 return 0;
2715 /* Function vect_vfa_segment_size.
2717 Create an expression that computes the size of segment
2718 that will be accessed for a data reference. The function takes into
2719 account that realignment loads may access one more vector.
2721 Input:
2722 DR: The data reference.
2723 LENGTH_FACTOR: segment length to consider.
2725 Return an expression whose value is the size of segment which will be
2726 accessed by DR. */
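/* E.g. with a DR_STEP of 4 bytes and LENGTH_FACTOR equal to a vectorization
   factor of 4, the segment length is 16 bytes; for accesses that use the
   dr_explicit_realign_optimized scheme one extra vector size is added,
   since the realignment load may touch the following vector.  */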
2728 static tree
2729 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2731 tree segment_length;
2733 if (integer_zerop (DR_STEP (dr)))
2734 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2735 else
2736 segment_length = size_binop (MULT_EXPR,
2737 fold_convert (sizetype, DR_STEP (dr)),
2738 fold_convert (sizetype, length_factor));
2740 if (vect_supportable_dr_alignment (dr, false)
2741 == dr_explicit_realign_optimized)
2743 tree vector_size = TYPE_SIZE_UNIT
2744 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2746 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2748 return segment_length;
2751 /* Function vect_prune_runtime_alias_test_list.
2753 Prune a list of ddrs to be tested at run-time by versioning for alias.
2754 Merge several alias checks into one if possible.
2755 Return FALSE if the resulting list of ddrs is longer than allowed by
2756 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2758 bool
2759 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2761 vec<ddr_p> may_alias_ddrs =
2762 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2763 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2764 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2765 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2766 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2768 ddr_p ddr;
2769 unsigned int i;
2770 tree length_factor;
2772 if (dump_enabled_p ())
2773 dump_printf_loc (MSG_NOTE, vect_location,
2774 "=== vect_prune_runtime_alias_test_list ===\n");
2776 if (may_alias_ddrs.is_empty ())
2777 return true;
2779 /* Basically, for each pair of dependent data refs store_ptr_0
2780 and load_ptr_0, we create an expression:
2782 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2783 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2785 for aliasing checks. However, in some cases we can decrease
2786 the number of checks by combining two checks into one. For
2787 example, suppose we have another pair of data refs store_ptr_0
2788 and load_ptr_1, and if the following condition is satisfied:
2790 load_ptr_0 < load_ptr_1 &&
2791 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2793 (this condition means, in each iteration of the vectorized loop,
2794 the accessed memory of store_ptr_0 cannot be between the memory
2795 of load_ptr_0 and load_ptr_1.)
2797 we can then use only the following expression to finish the
2798 aliasing checks between store_ptr_0 & load_ptr_0 and
2799 store_ptr_0 & load_ptr_1:
2801 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2802 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2804 Note that we only consider the case in which load_ptr_0 and load_ptr_1
2805 have the same base address. */
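/* As an illustration, with 4-byte elements and a vectorization factor of 4,
   checking store_ptr_0 against load_ptr_0 = &a[0] and against
   load_ptr_1 = &a[4] collapses into a single check once the segment of
   load_ptr_0 is extended by the 16-byte distance between the two loads.  */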
2807 comp_alias_ddrs.create (may_alias_ddrs.length ());
2809 /* First, we collect all data ref pairs for aliasing checks. */
2810 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2812 struct data_reference *dr_a, *dr_b;
2813 gimple *dr_group_first_a, *dr_group_first_b;
2814 tree segment_length_a, segment_length_b;
2815 gimple *stmt_a, *stmt_b;
2817 dr_a = DDR_A (ddr);
2818 stmt_a = DR_STMT (DDR_A (ddr));
2819 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2820 if (dr_group_first_a)
2822 stmt_a = dr_group_first_a;
2823 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2826 dr_b = DDR_B (ddr);
2827 stmt_b = DR_STMT (DDR_B (ddr));
2828 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2829 if (dr_group_first_b)
2831 stmt_b = dr_group_first_b;
2832 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2835 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2836 length_factor = scalar_loop_iters;
2837 else
2838 length_factor = size_int (vect_factor);
2839 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2840 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2842 dr_with_seg_len_pair_t dr_with_seg_len_pair
2843 (dr_with_seg_len (dr_a, segment_length_a),
2844 dr_with_seg_len (dr_b, segment_length_b));
2846 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2847 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2849 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2852 /* Second, we sort the collected data ref pairs so that we can scan
2853 them once to combine all possible aliasing checks. */
2854 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2856 /* Third, we scan the sorted dr pairs and check if we can combine
2857 alias checks of two neighbouring dr pairs. */
2858 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2860 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
2861 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2862 *dr_b1 = &comp_alias_ddrs[i-1].second,
2863 *dr_a2 = &comp_alias_ddrs[i].first,
2864 *dr_b2 = &comp_alias_ddrs[i].second;
2866 /* Remove duplicate data ref pairs. */
2867 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2869 if (dump_enabled_p ())
2871 dump_printf_loc (MSG_NOTE, vect_location,
2872 "found equal ranges ");
2873 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2874 DR_REF (dr_a1->dr));
2875 dump_printf (MSG_NOTE, ", ");
2876 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2877 DR_REF (dr_b1->dr));
2878 dump_printf (MSG_NOTE, " and ");
2879 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2880 DR_REF (dr_a2->dr));
2881 dump_printf (MSG_NOTE, ", ");
2882 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2883 DR_REF (dr_b2->dr));
2884 dump_printf (MSG_NOTE, "\n");
2887 comp_alias_ddrs.ordered_remove (i--);
2888 continue;
2891 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2893 /* We consider the case in which DR_B1 and DR_B2 are the same memref,
2894 and DR_A1 and DR_A2 are two consecutive memrefs. */
2895 if (*dr_a1 == *dr_a2)
2897 std::swap (dr_a1, dr_b1);
2898 std::swap (dr_a2, dr_b2);
2901 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2902 DR_BASE_ADDRESS (dr_a2->dr),
2904 || !tree_fits_shwi_p (dr_a1->offset)
2905 || !tree_fits_shwi_p (dr_a2->offset))
2906 continue;
2908 HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2909 - tree_to_shwi (dr_a1->offset));
2912 /* Now we check if the following condition is satisfied:
2914 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2916 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
2917 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2918 have to make a best estimate. We can get the minimum value
2919 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2920 then either of the following two conditions can guarantee the
2921 one above:
2923 1: DIFF <= MIN_SEG_LEN_B
2924 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
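        For example (illustration only): DIFF = 8 with MIN_SEG_LEN_B = 16
        already satisfies condition 1, while DIFF = 24 with
        SEGMENT_LENGTH_A = 20 and MIN_SEG_LEN_B = 16 satisfies condition 2
        because 24 - 20 = 4 < 16.  */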
2928 HOST_WIDE_INT min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
2929 ? tree_to_shwi (dr_b1->seg_len)
2930 : vect_factor);
2932 if (diff <= min_seg_len_b
2933 || (tree_fits_shwi_p (dr_a1->seg_len)
2934 && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
2936 if (dump_enabled_p ())
2938 dump_printf_loc (MSG_NOTE, vect_location,
2939 "merging ranges for ");
2940 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2941 DR_REF (dr_a1->dr));
2942 dump_printf (MSG_NOTE, ", ");
2943 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2944 DR_REF (dr_b1->dr));
2945 dump_printf (MSG_NOTE, " and ");
2946 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2947 DR_REF (dr_a2->dr));
2948 dump_printf (MSG_NOTE, ", ");
2949 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2950 DR_REF (dr_b2->dr));
2951 dump_printf (MSG_NOTE, "\n");
2954 dr_a1->seg_len = size_binop (PLUS_EXPR,
2955 dr_a2->seg_len, size_int (diff));
2956 comp_alias_ddrs.ordered_remove (i--);
2961 dump_printf_loc (MSG_NOTE, vect_location,
2962 "improved number of alias checks from %d to %d\n",
2963 may_alias_ddrs.length (), comp_alias_ddrs.length ());
2964 if ((int) comp_alias_ddrs.length () >
2965 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2966 return false;
2968 return true;
2971 /* Check whether a non-affine read or write in stmt is suitable for gather load
2972 or scatter store and if so, return a builtin decl for that operation. */
2974 tree
2975 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo, tree *basep,
2976 tree *offp, int *scalep)
2978 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2979 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2980 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2981 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2982 tree offtype = NULL_TREE;
2983 tree decl, base, off;
2984 machine_mode pmode;
2985 int punsignedp, pvolatilep;
2987 base = DR_REF (dr);
2988 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
2989 see if we can use the def stmt of the address. */
2990 if (is_gimple_call (stmt)
2991 && gimple_call_internal_p (stmt)
2992 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
2993 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
2994 && TREE_CODE (base) == MEM_REF
2995 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
2996 && integer_zerop (TREE_OPERAND (base, 1))
2997 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
2999 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3000 if (is_gimple_assign (def_stmt)
3001 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3002 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3005 /* The gather and scatter builtins need an address of the form
3006 loop_invariant + vector * {1, 2, 4, 8}
3008 or loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3009 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3010 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3011 multiplications and additions in it. To get a vector, we need
3012 a single SSA_NAME that will be defined in the loop and will
3013 contain everything that is not loop invariant and that can be
3014 vectorized. The following code attempts to find such a preexisting
3015 SSA_NAME OFF and put the loop invariants into a tree BASE
3016 that can be gimplified before the loop. */
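  /* For instance, for a gather of the form a[b[i]] with 4-byte elements the
     intended decomposition is BASE = &a[0], OFF = the SSA_NAME holding b[i]
     (possibly sign-extended), and SCALE = 4.  */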
3017 base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
3018 &pmode, &punsignedp, &pvolatilep, false);
3019 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
3021 if (TREE_CODE (base) == MEM_REF)
3023 if (!integer_zerop (TREE_OPERAND (base, 1)))
3025 if (off == NULL_TREE)
3027 offset_int moff = mem_ref_offset (base);
3028 off = wide_int_to_tree (sizetype, moff);
3030 else
3031 off = size_binop (PLUS_EXPR, off,
3032 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3034 base = TREE_OPERAND (base, 0);
3036 else
3037 base = build_fold_addr_expr (base);
3039 if (off == NULL_TREE)
3040 off = size_zero_node;
3042 /* If base is not loop invariant, then either off is 0, in which case we
3043 start with just the constant offset in the loop-invariant BASE and
3044 continue with base as OFF, or we give up.
3045 We could handle the latter case by gimplifying the addition of base + off
3046 into some SSA_NAME and using that as off, but for now punt. */
3047 if (!expr_invariant_in_loop_p (loop, base))
3049 if (!integer_zerop (off))
3050 return NULL_TREE;
3051 off = base;
3052 base = size_int (pbitpos / BITS_PER_UNIT);
3054 /* Otherwise put base + constant offset into the loop invariant BASE
3055 and continue with OFF. */
3056 else
3058 base = fold_convert (sizetype, base);
3059 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3062 /* OFF at this point may be either a SSA_NAME or some tree expression
3063 from get_inner_reference. Try to peel off loop invariants from it
3064 into BASE as long as possible. */
3065 STRIP_NOPS (off);
3066 while (offtype == NULL_TREE)
3068 enum tree_code code;
3069 tree op0, op1, add = NULL_TREE;
3071 if (TREE_CODE (off) == SSA_NAME)
3073 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3075 if (expr_invariant_in_loop_p (loop, off))
3076 return NULL_TREE;
3078 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3079 break;
3081 op0 = gimple_assign_rhs1 (def_stmt);
3082 code = gimple_assign_rhs_code (def_stmt);
3083 op1 = gimple_assign_rhs2 (def_stmt);
3085 else
3087 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3088 return NULL_TREE;
3089 code = TREE_CODE (off);
3090 extract_ops_from_tree (off, &code, &op0, &op1);
3092 switch (code)
3094 case POINTER_PLUS_EXPR:
3095 case PLUS_EXPR:
3096 if (expr_invariant_in_loop_p (loop, op0))
3098 add = op0;
3099 off = op1;
3100 do_add:
3101 add = fold_convert (sizetype, add);
3102 if (scale != 1)
3103 add = size_binop (MULT_EXPR, add, size_int (scale));
3104 base = size_binop (PLUS_EXPR, base, add);
3105 continue;
3107 if (expr_invariant_in_loop_p (loop, op1))
3109 add = op1;
3110 off = op0;
3111 goto do_add;
3113 break;
3114 case MINUS_EXPR:
3115 if (expr_invariant_in_loop_p (loop, op1))
3117 add = fold_convert (sizetype, op1);
3118 add = size_binop (MINUS_EXPR, size_zero_node, add);
3119 off = op0;
3120 goto do_add;
3122 break;
3123 case MULT_EXPR:
3124 if (scale == 1 && tree_fits_shwi_p (op1))
3126 scale = tree_to_shwi (op1);
3127 off = op0;
3128 continue;
3130 break;
3131 case SSA_NAME:
3132 off = op0;
3133 continue;
3134 CASE_CONVERT:
3135 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3136 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3137 break;
3138 if (TYPE_PRECISION (TREE_TYPE (op0))
3139 == TYPE_PRECISION (TREE_TYPE (off)))
3141 off = op0;
3142 continue;
3144 if (TYPE_PRECISION (TREE_TYPE (op0))
3145 < TYPE_PRECISION (TREE_TYPE (off)))
3147 off = op0;
3148 offtype = TREE_TYPE (off);
3149 STRIP_NOPS (off);
3150 continue;
3152 break;
3153 default:
3154 break;
3156 break;
3159 /* If at the end OFF still isn't a SSA_NAME or isn't
3160 defined in the loop, punt. */
3161 if (TREE_CODE (off) != SSA_NAME
3162 || expr_invariant_in_loop_p (loop, off))
3163 return NULL_TREE;
3165 if (offtype == NULL_TREE)
3166 offtype = TREE_TYPE (off);
3168 if (DR_IS_READ (dr))
3169 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3170 offtype, scale);
3171 else
3172 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3173 offtype, scale);
3175 if (decl == NULL_TREE)
3176 return NULL_TREE;
3178 if (basep)
3179 *basep = base;
3180 if (offp)
3181 *offp = off;
3182 if (scalep)
3183 *scalep = scale;
3184 return decl;
3187 /* Function vect_analyze_data_refs.
3189 Find all the data references in the loop or basic block.
3191 The general structure of the analysis of data refs in the vectorizer is as
3192 follows:
3193 1- vect_analyze_data_refs(loop/bb): call
3194 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3195 in the loop/bb and their dependences.
3196 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3197 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3198 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3202 bool
3203 vect_analyze_data_refs (vec_info *vinfo, int *min_vf, unsigned *n_stmts)
3205 struct loop *loop = NULL;
3206 basic_block bb = NULL;
3207 unsigned int i;
3208 vec<data_reference_p> datarefs;
3209 struct data_reference *dr;
3210 tree scalar_type;
3212 if (dump_enabled_p ())
3213 dump_printf_loc (MSG_NOTE, vect_location,
3214 "=== vect_analyze_data_refs ===\n");
3216 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3218 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3220 loop = LOOP_VINFO_LOOP (loop_vinfo);
3221 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3222 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
3224 if (dump_enabled_p ())
3225 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3226 "not vectorized: loop contains function calls"
3227 " or data references that cannot be analyzed\n");
3228 return false;
3231 for (i = 0; i < loop->num_nodes; i++)
3233 gimple_stmt_iterator gsi;
3235 for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3237 gimple *stmt = gsi_stmt (gsi);
3238 if (is_gimple_debug (stmt))
3239 continue;
3240 ++*n_stmts;
3241 if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3243 if (is_gimple_call (stmt) && loop->safelen)
3245 tree fndecl = gimple_call_fndecl (stmt), op;
3246 if (fndecl != NULL_TREE)
3248 struct cgraph_node *node = cgraph_node::get (fndecl);
3249 if (node != NULL && node->simd_clones != NULL)
3251 unsigned int j, n = gimple_call_num_args (stmt);
3252 for (j = 0; j < n; j++)
3254 op = gimple_call_arg (stmt, j);
3255 if (DECL_P (op)
3256 || (REFERENCE_CLASS_P (op)
3257 && get_base_address (op)))
3258 break;
3260 op = gimple_call_lhs (stmt);
3261 /* Ignore #pragma omp declare simd functions
3262 if they don't have data references in the
3263 call stmt itself. */
3264 if (j == n
3265 && !(op
3266 && (DECL_P (op)
3267 || (REFERENCE_CLASS_P (op)
3268 && get_base_address (op)))))
3269 continue;
3273 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3274 if (dump_enabled_p ())
3275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3276 "not vectorized: loop contains function "
3277 "calls or data references that cannot "
3278 "be analyzed\n");
3279 return false;
3284 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3286 else
3288 bb_vec_info bb_vinfo = as_a <bb_vec_info> (vinfo);
3289 gimple_stmt_iterator gsi;
3291 bb = BB_VINFO_BB (bb_vinfo);
3292 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3294 gimple *stmt = gsi_stmt (gsi);
3295 if (is_gimple_debug (stmt))
3296 continue;
3297 ++*n_stmts;
3298 if (!find_data_references_in_stmt (NULL, stmt,
3299 &BB_VINFO_DATAREFS (bb_vinfo)))
3301 /* Mark the rest of the basic-block as unvectorizable. */
3302 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3304 stmt = gsi_stmt (gsi);
3305 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3307 break;
3311 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3314 /* Go through the data-refs, check that the analysis succeeded. Update
3315 pointer from stmt_vec_info struct to DR and vectype. */
3317 FOR_EACH_VEC_ELT (datarefs, i, dr)
3319 gimple *stmt;
3320 stmt_vec_info stmt_info;
3321 tree base, offset, init;
3322 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3323 bool simd_lane_access = false;
3324 int vf;
3326 again:
3327 if (!dr || !DR_REF (dr))
3329 if (dump_enabled_p ())
3330 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3331 "not vectorized: unhandled data-ref\n");
3332 return false;
3335 stmt = DR_STMT (dr);
3336 stmt_info = vinfo_for_stmt (stmt);
3338 /* Discard clobbers from the dataref vector. We will remove
3339 clobber stmts during vectorization. */
3340 if (gimple_clobber_p (stmt))
3342 free_data_ref (dr);
3343 if (i == datarefs.length () - 1)
3345 datarefs.pop ();
3346 break;
3348 datarefs.ordered_remove (i);
3349 dr = datarefs[i];
3350 goto again;
3353 /* Check that analysis of the data-ref succeeded. */
3354 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3355 || !DR_STEP (dr))
3357 bool maybe_gather
3358 = DR_IS_READ (dr)
3359 && !TREE_THIS_VOLATILE (DR_REF (dr))
3360 && targetm.vectorize.builtin_gather != NULL;
3361 bool maybe_scatter
3362 = DR_IS_WRITE (dr)
3363 && !TREE_THIS_VOLATILE (DR_REF (dr))
3364 && targetm.vectorize.builtin_scatter != NULL;
3365 bool maybe_simd_lane_access
3366 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3368 /* If the target supports vector gather loads or scatter stores, or if
3369 this might be a SIMD lane access, see whether they can be used. */
3370 if (is_a <loop_vec_info> (vinfo)
3371 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3372 && !nested_in_vect_loop_p (loop, stmt))
3374 struct data_reference *newdr
3375 = create_data_ref (NULL, loop_containing_stmt (stmt),
3376 DR_REF (dr), stmt, maybe_scatter ? false : true);
3377 gcc_assert (newdr != NULL && DR_REF (newdr));
3378 if (DR_BASE_ADDRESS (newdr)
3379 && DR_OFFSET (newdr)
3380 && DR_INIT (newdr)
3381 && DR_STEP (newdr)
3382 && integer_zerop (DR_STEP (newdr)))
3384 if (maybe_simd_lane_access)
3386 tree off = DR_OFFSET (newdr);
3387 STRIP_NOPS (off);
3388 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3389 && TREE_CODE (off) == MULT_EXPR
3390 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3392 tree step = TREE_OPERAND (off, 1);
3393 off = TREE_OPERAND (off, 0);
3394 STRIP_NOPS (off);
3395 if (CONVERT_EXPR_P (off)
3396 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3397 0)))
3398 < TYPE_PRECISION (TREE_TYPE (off)))
3399 off = TREE_OPERAND (off, 0);
3400 if (TREE_CODE (off) == SSA_NAME)
3402 gimple *def = SSA_NAME_DEF_STMT (off);
3403 tree reft = TREE_TYPE (DR_REF (newdr));
3404 if (is_gimple_call (def)
3405 && gimple_call_internal_p (def)
3406 && (gimple_call_internal_fn (def)
3407 == IFN_GOMP_SIMD_LANE))
3409 tree arg = gimple_call_arg (def, 0);
3410 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3411 arg = SSA_NAME_VAR (arg);
3412 if (arg == loop->simduid
3413 /* For now. */
3414 && tree_int_cst_equal
3415 (TYPE_SIZE_UNIT (reft),
3416 step))
3418 DR_OFFSET (newdr) = ssize_int (0);
3419 DR_STEP (newdr) = step;
3420 DR_ALIGNED_TO (newdr)
3421 = size_int (BIGGEST_ALIGNMENT);
3422 dr = newdr;
3423 simd_lane_access = true;
3429 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3431 dr = newdr;
3432 if (maybe_gather)
3433 gatherscatter = GATHER;
3434 else
3435 gatherscatter = SCATTER;
3438 if (gatherscatter == SG_NONE && !simd_lane_access)
3439 free_data_ref (newdr);
3442 if (gatherscatter == SG_NONE && !simd_lane_access)
3444 if (dump_enabled_p ())
3446 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3447 "not vectorized: data ref analysis "
3448 "failed ");
3449 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3450 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3453 if (is_a <bb_vec_info> (vinfo))
3454 break;
3456 return false;
3460 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3462 if (dump_enabled_p ())
3463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3464 "not vectorized: base addr of dr is a "
3465 "constant\n");
3467 if (is_a <bb_vec_info> (vinfo))
3468 break;
3470 if (gatherscatter != SG_NONE || simd_lane_access)
3471 free_data_ref (dr);
3472 return false;
3475 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3477 if (dump_enabled_p ())
3479 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3480 "not vectorized: volatile type ");
3481 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3482 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3485 if (is_a <bb_vec_info> (vinfo))
3486 break;
3488 return false;
3491 if (stmt_can_throw_internal (stmt))
3493 if (dump_enabled_p ())
3495 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3496 "not vectorized: statement can throw an "
3497 "exception ");
3498 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3499 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3502 if (is_a <bb_vec_info> (vinfo))
3503 break;
3505 if (gatherscatter != SG_NONE || simd_lane_access)
3506 free_data_ref (dr);
3507 return false;
3510 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3511 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3513 if (dump_enabled_p ())
3515 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3516 "not vectorized: statement is bitfield "
3517 "access ");
3518 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3519 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3522 if (is_a <bb_vec_info> (vinfo))
3523 break;
3525 if (gatherscatter != SG_NONE || simd_lane_access)
3526 free_data_ref (dr);
3527 return false;
3530 base = unshare_expr (DR_BASE_ADDRESS (dr));
3531 offset = unshare_expr (DR_OFFSET (dr));
3532 init = unshare_expr (DR_INIT (dr));
3534 if (is_gimple_call (stmt)
3535 && (!gimple_call_internal_p (stmt)
3536 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3537 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3539 if (dump_enabled_p ())
3541 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3542 "not vectorized: dr in a call ");
3543 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3544 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3547 if (is_a <bb_vec_info> (vinfo))
3548 break;
3550 if (gatherscatter != SG_NONE || simd_lane_access)
3551 free_data_ref (dr);
3552 return false;
3555 /* Update DR field in stmt_vec_info struct. */
3557 /* If the dataref is in an inner-loop of the loop that is considered for
3558 vectorization, we also want to analyze the access relative to
3559 the outer-loop (DR contains information only relative to the
3560 inner-most enclosing loop). We do that by building a reference to the
3561 first location accessed by the inner-loop, and analyze it relative to
3562 the outer-loop. */
3563 if (loop && nested_in_vect_loop_p (loop, stmt))
3565 tree outer_step, outer_base, outer_init;
3566 HOST_WIDE_INT pbitsize, pbitpos;
3567 tree poffset;
3568 machine_mode pmode;
3569 int punsignedp, pvolatilep;
3570 affine_iv base_iv, offset_iv;
3571 tree dinit;
3573 /* Build a reference to the first location accessed by the
3574 inner-loop: *(BASE+INIT). (The first location is actually
3575 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3576 tree inner_base = build_fold_indirect_ref
3577 (fold_build_pointer_plus (base, init));
3579 if (dump_enabled_p ())
3581 dump_printf_loc (MSG_NOTE, vect_location,
3582 "analyze in outer-loop: ");
3583 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3584 dump_printf (MSG_NOTE, "\n");
3587 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3588 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3589 gcc_assert (outer_base != NULL_TREE);
3591 if (pbitpos % BITS_PER_UNIT != 0)
3593 if (dump_enabled_p ())
3594 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3595 "failed: bit offset alignment.\n");
3596 return false;
3599 outer_base = build_fold_addr_expr (outer_base);
3600 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3601 &base_iv, false))
3603 if (dump_enabled_p ())
3604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3605 "failed: evolution of base is not affine.\n");
3606 return false;
3609 if (offset)
3611 if (poffset)
3612 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3613 poffset);
3614 else
3615 poffset = offset;
3618 if (!poffset)
3620 offset_iv.base = ssize_int (0);
3621 offset_iv.step = ssize_int (0);
3623 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3624 &offset_iv, false))
3626 if (dump_enabled_p ())
3627 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3628 "evolution of offset is not affine.\n");
3629 return false;
3632 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3633 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3634 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3635 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3636 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3638 outer_step = size_binop (PLUS_EXPR,
3639 fold_convert (ssizetype, base_iv.step),
3640 fold_convert (ssizetype, offset_iv.step));
3642 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3643 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3644 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3645 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3646 STMT_VINFO_DR_OFFSET (stmt_info) =
3647 fold_convert (ssizetype, offset_iv.base);
3648 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3649 size_int (highest_pow2_factor (offset_iv.base));
3651 if (dump_enabled_p ())
3653 dump_printf_loc (MSG_NOTE, vect_location,
3654 "\touter base_address: ");
3655 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3656 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3657 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3658 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3659 STMT_VINFO_DR_OFFSET (stmt_info));
3660 dump_printf (MSG_NOTE,
3661 "\n\touter constant offset from base address: ");
3662 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3663 STMT_VINFO_DR_INIT (stmt_info));
3664 dump_printf (MSG_NOTE, "\n\touter step: ");
3665 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3666 STMT_VINFO_DR_STEP (stmt_info));
3667 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3668 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3669 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3670 dump_printf (MSG_NOTE, "\n");
3674 if (STMT_VINFO_DATA_REF (stmt_info))
3676 if (dump_enabled_p ())
3678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3679 "not vectorized: more than one data ref "
3680 "in stmt: ");
3681 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3682 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3685 if (is_a <bb_vec_info> (vinfo))
3686 break;
3688 if (gatherscatter != SG_NONE || simd_lane_access)
3689 free_data_ref (dr);
3690 return false;
3693 STMT_VINFO_DATA_REF (stmt_info) = dr;
3694 if (simd_lane_access)
3696 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3697 free_data_ref (datarefs[i]);
3698 datarefs[i] = dr;
3701 /* Set vectype for STMT. */
3702 scalar_type = TREE_TYPE (DR_REF (dr));
3703 STMT_VINFO_VECTYPE (stmt_info)
3704 = get_vectype_for_scalar_type (scalar_type);
3705 if (!STMT_VINFO_VECTYPE (stmt_info))
3707 if (dump_enabled_p ())
3709 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3710 "not vectorized: no vectype for stmt: ");
3711 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3712 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3713 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3714 scalar_type);
3715 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3718 if (is_a <bb_vec_info> (vinfo))
3719 break;
3721 if (gatherscatter != SG_NONE || simd_lane_access)
3723 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3724 if (gatherscatter != SG_NONE)
3725 free_data_ref (dr);
3727 return false;
3729 else
3731 if (dump_enabled_p ())
3733 dump_printf_loc (MSG_NOTE, vect_location,
3734 "got vectype for stmt: ");
3735 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3736 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3737 STMT_VINFO_VECTYPE (stmt_info));
3738 dump_printf (MSG_NOTE, "\n");
3742 /* Adjust the minimal vectorization factor according to the
3743 vector type. */
3744 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3745 if (vf > *min_vf)
3746 *min_vf = vf;
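/* For illustration (assuming 16-byte vectors): a loop accessing both a
   char array (V16QI, 16 subparts) and an int array (V4SI, 4 subparts)
   ends up with *MIN_VF == 16, i.e. the minimal acceptable vectorization
   factor is the largest number of subparts over all datarefs.  */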
3748 if (gatherscatter != SG_NONE)
3750 tree off;
3751 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3752 NULL, &off, NULL)
3753 || get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3755 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3756 free_data_ref (dr);
3757 if (dump_enabled_p ())
3759 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3760 (gatherscatter == GATHER) ?
3761 "not vectorized: not suitable for gather "
3762 "load " :
3763 "not vectorized: not suitable for scatter "
3764 "store ");
3765 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3766 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3768 return false;
3771 datarefs[i] = dr;
3772 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3775 else if (is_a <loop_vec_info> (vinfo)
3776 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3778 if (nested_in_vect_loop_p (loop, stmt))
3780 if (dump_enabled_p ())
3782 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3783 "not vectorized: not suitable for strided "
3784 "load ");
3785 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3786 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3788 return false;
3790 STMT_VINFO_STRIDED_P (stmt_info) = true;
3794 /* If, when trying to vectorize a basic-block, we stopped the analysis
3795 at the first dataref we could not analyze, mark the rest of the
3796 datarefs as not vectorizable and truncate the vector of datarefs.
3797 That avoids spending useless time analyzing their dependences. */
3798 if (i != datarefs.length ())
3800 gcc_assert (is_a <bb_vec_info> (vinfo));
3801 for (unsigned j = i; j < datarefs.length (); ++j)
3803 data_reference_p dr = datarefs[j];
3804 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3805 free_data_ref (dr);
3807 datarefs.truncate (i);
3810 return true;
3814 /* Function vect_get_new_vect_var.
3816 Returns a name for a new variable. The current naming scheme uses the
3817 prefix "vect", "stmp" or "vectp" (depending on the value of VAR_KIND)
3818 for vectorizer generated variables, and appends "_" and NAME to the
3819 prefix if NAME is provided. */
3821 tree
3822 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3824 const char *prefix;
3825 tree new_vect_var;
3827 switch (var_kind)
3829 case vect_simple_var:
3830 prefix = "vect";
3831 break;
3832 case vect_scalar_var:
3833 prefix = "stmp";
3834 break;
3835 case vect_pointer_var:
3836 prefix = "vectp";
3837 break;
3838 default:
3839 gcc_unreachable ();
3842 if (name)
3844 char* tmp = concat (prefix, "_", name, NULL);
3845 new_vect_var = create_tmp_reg (type, tmp);
3846 free (tmp);
3848 else
3849 new_vect_var = create_tmp_reg (type, prefix);
3851 return new_vect_var;
3854 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
3856 static void
3857 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3858 stmt_vec_info stmt_info)
3860 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3861 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3862 int misalign = DR_MISALIGNMENT (dr);
3863 if (misalign == -1)
3864 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3865 else
3866 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
3869 /* Function vect_create_addr_base_for_vector_ref.
3871 Create an expression that computes the address of the first memory location
3872 that will be accessed for a data reference.
3874 Input:
3875 STMT: The statement containing the data reference.
3876 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3877 OFFSET: Optional. If supplied, it is added to the initial address.
3878 LOOP: Specifies relative to which loop-nest the address should be computed.
3879 For example, when the dataref is in an inner-loop nested in an
3880 outer-loop that is now being vectorized, LOOP can be either the
3881 outer-loop, or the inner-loop. The first memory location accessed
3882 by the following dataref ('in' points to short):
3884 for (i=0; i<N; i++)
3885 for (j=0; j<M; j++)
3886 s += in[i+j]
3888 is as follows:
3889 if LOOP=i_loop: &in (relative to i_loop)
3890 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3891 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
3892 initial address. Unlike OFFSET, which is the number of elements to
3893 be added, BYTE_OFFSET is measured in bytes.
3895 Output:
3896 1. Return an SSA_NAME whose value is the address of the memory location of
3897 the first vector of the data reference.
3898 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3899 these statement(s) which define the returned SSA_NAME.
3901 FORNOW: We are only handling array accesses with step 1. */
3903 tree
3904 vect_create_addr_base_for_vector_ref (gimple *stmt,
3905 gimple_seq *new_stmt_list,
3906 tree offset,
3907 struct loop *loop,
3908 tree byte_offset)
3910 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3911 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3912 tree data_ref_base;
3913 const char *base_name;
3914 tree addr_base;
3915 tree dest;
3916 gimple_seq seq = NULL;
3917 tree base_offset;
3918 tree init;
3919 tree vect_ptr_type;
3920 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3921 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3923 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3925 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3927 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3929 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3930 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3931 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3933 else
3935 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3936 base_offset = unshare_expr (DR_OFFSET (dr));
3937 init = unshare_expr (DR_INIT (dr));
3940 if (loop_vinfo)
3941 base_name = get_name (data_ref_base);
3942 else
3944 base_offset = ssize_int (0);
3945 init = ssize_int (0);
3946 base_name = get_name (DR_REF (dr));
3949 /* Create base_offset. */
3950 base_offset = size_binop (PLUS_EXPR,
3951 fold_convert (sizetype, base_offset),
3952 fold_convert (sizetype, init));
3954 if (offset)
3956 offset = fold_build2 (MULT_EXPR, sizetype,
3957 fold_convert (sizetype, offset), step);
3958 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3959 base_offset, offset);
3961 if (byte_offset)
3963 byte_offset = fold_convert (sizetype, byte_offset);
3964 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3965 base_offset, byte_offset);
3968 /* base + base_offset */
3969 if (loop_vinfo)
3970 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3971 else
3973 addr_base = build1 (ADDR_EXPR,
3974 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3975 unshare_expr (DR_REF (dr)));
3978 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3979 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3980 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
3981 gimple_seq_add_seq (new_stmt_list, seq);
3983 if (DR_PTR_INFO (dr)
3984 && TREE_CODE (addr_base) == SSA_NAME
3985 && !SSA_NAME_PTR_INFO (addr_base))
3987 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
3988 if (offset || byte_offset)
3989 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3992 if (dump_enabled_p ())
3994 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3995 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3996 dump_printf (MSG_NOTE, "\n");
3999 return addr_base;
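/* Illustrative sketch (not from the original sources): for a loop-based
   access to int a[N] with no OFFSET/BYTE_OFFSET, the sequence appended to
   NEW_STMT_LIST by the function above is roughly

     vectp_a.0 = &a + 0;	base + (offset + init)

   and vectp_a.0 (an SSA_NAME of pointer-to-vectype) is the return value.
   The temporary's name and numbering are only an assumption here.  */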
4003 /* Function vect_create_data_ref_ptr.
4005 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4006 location accessed in the loop by STMT, along with the def-use update
4007 chain to appropriately advance the pointer through the loop iterations.
4008 Also set aliasing information for the pointer. This pointer is used by
4009 the callers to this function to create a memory reference expression for
4010 vector load/store access.
4012 Input:
4013 1. STMT: a stmt that references memory. Expected to be of the form
4014 GIMPLE_ASSIGN <name, data-ref> or
4015 GIMPLE_ASSIGN <data-ref, name>.
4016 2. AGGR_TYPE: the type of the reference, which should be either a vector
4017 or an array.
4018 3. AT_LOOP: the loop where the vector memref is to be created.
4019 4. OFFSET (optional): an offset to be added to the initial address accessed
4020 by the data-ref in STMT.
4021 5. BSI: location where the new stmts are to be placed if there is no loop
4022 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4023 pointing to the initial address.
4024 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4025 to the initial address accessed by the data-ref in STMT. This is
4026 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4027 in bytes.
4029 Output:
4030 1. Declare a new ptr to vector_type, and have it point to the base of the
4031 data reference (initial address accessed by the data reference).
4032 For example, for vector of type V8HI, the following code is generated:
4034 v8hi *ap;
4035 ap = (v8hi *)initial_address;
4037 if OFFSET is not supplied:
4038 initial_address = &a[init];
4039 if OFFSET is supplied:
4040 initial_address = &a[init + OFFSET];
4041 if BYTE_OFFSET is supplied:
4042 initial_address = &a[init] + BYTE_OFFSET;
4044 Return the initial_address in INITIAL_ADDRESS.
4046 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4047 update the pointer in each iteration of the loop.
4049 Return the increment stmt that updates the pointer in PTR_INCR.
4051 3. Set INV_P to true if the access pattern of the data reference in the
4052 vectorized loop is invariant. Set it to false otherwise.
4054 4. Return the pointer. */
4056 tree
4057 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4058 tree offset, tree *initial_address,
4059 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4060 bool only_init, bool *inv_p, tree byte_offset)
4062 const char *base_name;
4063 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4064 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4065 struct loop *loop = NULL;
4066 bool nested_in_vect_loop = false;
4067 struct loop *containing_loop = NULL;
4068 tree aggr_ptr_type;
4069 tree aggr_ptr;
4070 tree new_temp;
4071 gimple_seq new_stmt_list = NULL;
4072 edge pe = NULL;
4073 basic_block new_bb;
4074 tree aggr_ptr_init;
4075 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4076 tree aptr;
4077 gimple_stmt_iterator incr_gsi;
4078 bool insert_after;
4079 tree indx_before_incr, indx_after_incr;
4080 gimple *incr;
4081 tree step;
4082 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4084 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4085 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4087 if (loop_vinfo)
4089 loop = LOOP_VINFO_LOOP (loop_vinfo);
4090 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4091 containing_loop = (gimple_bb (stmt))->loop_father;
4092 pe = loop_preheader_edge (loop);
4094 else
4096 gcc_assert (bb_vinfo);
4097 only_init = true;
4098 *ptr_incr = NULL;
4101 /* Check the step (evolution) of the load in LOOP, and record
4102 whether it's invariant. */
4103 if (nested_in_vect_loop)
4104 step = STMT_VINFO_DR_STEP (stmt_info);
4105 else
4106 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4108 if (integer_zerop (step))
4109 *inv_p = true;
4110 else
4111 *inv_p = false;
4113 /* Create an expression for the first address accessed by this load
4114 in LOOP. */
4115 base_name = get_name (DR_BASE_ADDRESS (dr));
4117 if (dump_enabled_p ())
4119 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4120 dump_printf_loc (MSG_NOTE, vect_location,
4121 "create %s-pointer variable to type: ",
4122 get_tree_code_name (TREE_CODE (aggr_type)));
4123 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4124 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4125 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4126 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4127 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4128 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4129 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4130 else
4131 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4132 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4133 dump_printf (MSG_NOTE, "\n");
4136 /* (1) Create the new aggregate-pointer variable.
4137 Vector and array types inherit the alias set of their component
4138 type by default so we need to use a ref-all pointer if the data
4139 reference does not conflict with the created aggregated data
4140 reference because it is not addressable. */
4141 bool need_ref_all = false;
4142 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4143 get_alias_set (DR_REF (dr))))
4144 need_ref_all = true;
4145 /* Likewise for any of the data references in the stmt group. */
4146 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4148 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4151 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4152 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4153 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4154 get_alias_set (DR_REF (sdr))))
4156 need_ref_all = true;
4157 break;
4159 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4161 while (orig_stmt);
4163 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4164 need_ref_all);
4165 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4168 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4169 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4170 def-use update cycles for the pointer: one relative to the outer-loop
4171 (LOOP), which is what steps (3) and (4) below do. The other is relative
4172 to the inner-loop (which is the inner-most loop containing the dataref),
4173 and this is done by step (5) below.
4175 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4176 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4177 redundant. Steps (3),(4) create the following:
4179 vp0 = &base_addr;
4180 LOOP: vp1 = phi(vp0,vp2)
4183 vp2 = vp1 + step
4184 goto LOOP
4186 If there is an inner-loop nested in loop, then step (5) will also be
4187 applied, and an additional update in the inner-loop will be created:
4189 vp0 = &base_addr;
4190 LOOP: vp1 = phi(vp0,vp2)
4192 inner: vp3 = phi(vp1,vp4)
4193 vp4 = vp3 + inner_step
4194 if () goto inner
4196 vp2 = vp1 + step
4197 if () goto LOOP */
4199 /* (2) Calculate the initial address of the aggregate-pointer, and set
4200 the aggregate-pointer to point to it before the loop. */
4202 /* Create: &(base[init_val+offset]) + byte_offset in the loop preheader. */
4204 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4205 offset, loop, byte_offset);
4206 if (new_stmt_list)
4208 if (pe)
4210 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4211 gcc_assert (!new_bb);
4213 else
4214 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4217 *initial_address = new_temp;
4218 aggr_ptr_init = new_temp;
4220 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4221 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4222 inner-loop nested in LOOP (during outer-loop vectorization). */
4224 /* No update in loop is required. */
4225 if (only_init && (!loop_vinfo || at_loop == loop))
4226 aptr = aggr_ptr_init;
4227 else
4229 /* The step of the aggregate pointer is the type size. */
4230 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4231 /* One exception to the above is when the scalar step of the load in
4232 LOOP is zero. In this case the step here is also zero. */
4233 if (*inv_p)
4234 iv_step = size_zero_node;
4235 else if (tree_int_cst_sgn (step) == -1)
4236 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4238 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4240 create_iv (aggr_ptr_init,
4241 fold_convert (aggr_ptr_type, iv_step),
4242 aggr_ptr, loop, &incr_gsi, insert_after,
4243 &indx_before_incr, &indx_after_incr);
4244 incr = gsi_stmt (incr_gsi);
4245 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4247 /* Copy the points-to information if it exists. */
4248 if (DR_PTR_INFO (dr))
4250 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4251 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4253 if (ptr_incr)
4254 *ptr_incr = incr;
4256 aptr = indx_before_incr;
4259 if (!nested_in_vect_loop || only_init)
4260 return aptr;
4263 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4264 nested in LOOP, if exists. */
4266 gcc_assert (nested_in_vect_loop);
4267 if (!only_init)
4269 standard_iv_increment_position (containing_loop, &incr_gsi,
4270 &insert_after);
4271 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4272 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4273 &indx_after_incr);
4274 incr = gsi_stmt (incr_gsi);
4275 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4277 /* Copy the points-to information if it exists. */
4278 if (DR_PTR_INFO (dr))
4280 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4281 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4283 if (ptr_incr)
4284 *ptr_incr = incr;
4286 return indx_before_incr;
4288 else
4289 gcc_unreachable ();
4293 /* Function bump_vector_ptr
4295 Increment a pointer (to a vector type) by vector-size. If requested,
4296 i.e. if PTR-INCR is given, then also connect the new increment stmt
4297 to the existing def-use update-chain of the pointer, by modifying
4298 the PTR_INCR as illustrated below:
4300 The pointer def-use update-chain before this function:
4301 DATAREF_PTR = phi (p_0, p_2)
4302 ....
4303 PTR_INCR: p_2 = DATAREF_PTR + step
4305 The pointer def-use update-chain after this function:
4306 DATAREF_PTR = phi (p_0, p_2)
4307 ....
4308 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4309 ....
4310 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4312 Input:
4313 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4314 in the loop.
4315 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4316 the loop. The increment amount across iterations is expected
4317 to be vector_size.
4318 BSI - location where the new update stmt is to be placed.
4319 STMT - the original scalar memory-access stmt that is being vectorized.
4320 BUMP - optional. The offset by which to bump the pointer. If not given,
4321 the offset is assumed to be vector_size.
4323 Output: Return NEW_DATAREF_PTR as illustrated above.
4327 tree
4328 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4329 gimple *stmt, tree bump)
4331 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4332 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4333 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4334 tree update = TYPE_SIZE_UNIT (vectype);
4335 gassign *incr_stmt;
4336 ssa_op_iter iter;
4337 use_operand_p use_p;
4338 tree new_dataref_ptr;
4340 if (bump)
4341 update = bump;
4343 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4344 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4345 else
4346 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4347 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4348 dataref_ptr, update);
4349 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4351 /* Copy the points-to information if it exists. */
4352 if (DR_PTR_INFO (dr))
4354 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4355 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4358 if (!ptr_incr)
4359 return new_dataref_ptr;
4361 /* Update the vector-pointer's cross-iteration increment. */
4362 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4364 tree use = USE_FROM_PTR (use_p);
4366 if (use == dataref_ptr)
4367 SET_USE (use_p, new_dataref_ptr);
4368 else
4369 gcc_assert (tree_int_cst_compare (use, update) == 0);
4372 return new_dataref_ptr;
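/* Illustrative sketch (not from the original sources): with a V4SI vectype
   and no explicit BUMP, the function above emits roughly

     new_dataref_ptr = dataref_ptr + 16;	TYPE_SIZE_UNIT (V4SI)

   and, if PTR_INCR was given, rewrites its use of dataref_ptr to
   new_dataref_ptr so the cross-iteration increment stays correct.  */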
4376 /* Function vect_create_destination_var.
4378 Create a new temporary of type VECTYPE. */
4380 tree
4381 vect_create_destination_var (tree scalar_dest, tree vectype)
4383 tree vec_dest;
4384 const char *name;
4385 char *new_name;
4386 tree type;
4387 enum vect_var_kind kind;
4389 kind = vectype ? vect_simple_var : vect_scalar_var;
4390 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4392 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4394 name = get_name (scalar_dest);
4395 if (name)
4396 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4397 else
4398 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4399 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4400 free (new_name);
4402 return vec_dest;
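/* Illustrative example (an assumption about the generated names): for a
   scalar destination x_5 and a V4SI VECTYPE, the call above yields a new
   temporary named along the lines of vect_x_5, of type V4SI; with a NULL
   VECTYPE the prefix would be "stmp" and the type that of x_5.  */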
4405 /* Function vect_grouped_store_supported.
4407 Returns TRUE if interleave high and interleave low permutations
4408 are supported, and FALSE otherwise. */
4410 bool
4411 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4413 machine_mode mode = TYPE_MODE (vectype);
4415 /* vect_permute_store_chain requires the group size to be equal to 3 or
4416 be a power of two. */
4417 if (count != 3 && exact_log2 (count) == -1)
4419 if (dump_enabled_p ())
4420 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4421 "the size of the group of accesses"
4422 " is not a power of 2 or not eqaul to 3\n");
4423 return false;
4426 /* Check that the permutation is supported. */
4427 if (VECTOR_MODE_P (mode))
4429 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4430 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4432 if (count == 3)
4434 unsigned int j0 = 0, j1 = 0, j2 = 0;
4435 unsigned int i, j;
4437 for (j = 0; j < 3; j++)
4439 int nelt0 = ((3 - j) * nelt) % 3;
4440 int nelt1 = ((3 - j) * nelt + 1) % 3;
4441 int nelt2 = ((3 - j) * nelt + 2) % 3;
4442 for (i = 0; i < nelt; i++)
4444 if (3 * i + nelt0 < nelt)
4445 sel[3 * i + nelt0] = j0++;
4446 if (3 * i + nelt1 < nelt)
4447 sel[3 * i + nelt1] = nelt + j1++;
4448 if (3 * i + nelt2 < nelt)
4449 sel[3 * i + nelt2] = 0;
4451 if (!can_vec_perm_p (mode, false, sel))
4453 if (dump_enabled_p ())
4454 dump_printf (MSG_MISSED_OPTIMIZATION,
4455 "permutaion op not supported by target.\n");
4456 return false;
4459 for (i = 0; i < nelt; i++)
4461 if (3 * i + nelt0 < nelt)
4462 sel[3 * i + nelt0] = 3 * i + nelt0;
4463 if (3 * i + nelt1 < nelt)
4464 sel[3 * i + nelt1] = 3 * i + nelt1;
4465 if (3 * i + nelt2 < nelt)
4466 sel[3 * i + nelt2] = nelt + j2++;
4468 if (!can_vec_perm_p (mode, false, sel))
4470 if (dump_enabled_p ())
4471 dump_printf (MSG_MISSED_OPTIMIZATION,
4472 "permutaion op not supported by target.\n");
4473 return false;
4476 return true;
4478 else
4480 /* If length is not equal to 3 then only power of 2 is supported. */
4481 gcc_assert (exact_log2 (count) != -1);
4483 for (i = 0; i < nelt / 2; i++)
4485 sel[i * 2] = i;
4486 sel[i * 2 + 1] = i + nelt;
4488 if (can_vec_perm_p (mode, false, sel))
4490 for (i = 0; i < nelt; i++)
4491 sel[i] += nelt / 2;
4492 if (can_vec_perm_p (mode, false, sel))
4493 return true;
4498 if (dump_enabled_p ())
4499 dump_printf (MSG_MISSED_OPTIMIZATION,
4500 "permutaion op not supported by target.\n");
4501 return false;
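/* Illustrative example (not from the original sources): for V4SI and a
   power-of-two COUNT, the two masks checked above are, with nelt == 4,

     interleave high:  { 0, 4, 1, 5 }
     interleave low:   { 2, 6, 3, 7 }

   i.e. exactly the masks later built by vect_permute_store_chain.  */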
4505 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4506 type VECTYPE. */
4508 bool
4509 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4511 return vect_lanes_optab_supported_p ("vec_store_lanes",
4512 vec_store_lanes_optab,
4513 vectype, count);
4517 /* Function vect_permute_store_chain.
4519 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4520 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4521 the data correctly for the stores. Return the final references for stores
4522 in RESULT_CHAIN.
4524 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4525 The input is 4 vectors each containing 8 elements. We assign a number to
4526 each element, the input sequence is:
4528 1st vec: 0 1 2 3 4 5 6 7
4529 2nd vec: 8 9 10 11 12 13 14 15
4530 3rd vec: 16 17 18 19 20 21 22 23
4531 4th vec: 24 25 26 27 28 29 30 31
4533 The output sequence should be:
4535 1st vec: 0 8 16 24 1 9 17 25
4536 2nd vec: 2 10 18 26 3 11 19 27
4537 3rd vec: 4 12 20 28 5 13 21 29
4538 4th vec: 6 14 22 30 7 15 23 31
4540 i.e., we interleave the contents of the four vectors in their order.
4542 We use interleave_high/low instructions to create such output. The input of
4543 each interleave_high/low operation is two vectors:
4544 1st vec 2nd vec
4545 0 1 2 3 4 5 6 7
4546 the even elements of the result vector are obtained left-to-right from the
4547 high/low elements of the first vector. The odd elements of the result are
4548 obtained left-to-right from the high/low elements of the second vector.
4549 The output of interleave_high will be: 0 4 1 5
4550 and of interleave_low: 2 6 3 7
4553 The permutation is done in log LENGTH stages. In each stage interleave_high
4554 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4555 where the first argument is taken from the first half of DR_CHAIN and the
4556 second argument from its second half.
4557 In our example,
4559 I1: interleave_high (1st vec, 3rd vec)
4560 I2: interleave_low (1st vec, 3rd vec)
4561 I3: interleave_high (2nd vec, 4th vec)
4562 I4: interleave_low (2nd vec, 4th vec)
4564 The output for the first stage is:
4566 I1: 0 16 1 17 2 18 3 19
4567 I2: 4 20 5 21 6 22 7 23
4568 I3: 8 24 9 25 10 26 11 27
4569 I4: 12 28 13 29 14 30 15 31
4571 The output of the second stage, i.e. the final result is:
4573 I1: 0 8 16 24 1 9 17 25
4574 I2: 2 10 18 26 3 11 19 27
4575 I3: 4 12 20 28 5 13 21 29
4576 I4: 6 14 22 30 7 15 23 31. */
4578 void
4579 vect_permute_store_chain (vec<tree> dr_chain,
4580 unsigned int length,
4581 gimple *stmt,
4582 gimple_stmt_iterator *gsi,
4583 vec<tree> *result_chain)
4585 tree vect1, vect2, high, low;
4586 gimple *perm_stmt;
4587 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4588 tree perm_mask_low, perm_mask_high;
4589 tree data_ref;
4590 tree perm3_mask_low, perm3_mask_high;
4591 unsigned int i, n, log_length = exact_log2 (length);
4592 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4593 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4595 result_chain->quick_grow (length);
4596 memcpy (result_chain->address (), dr_chain.address (),
4597 length * sizeof (tree));
4599 if (length == 3)
4601 unsigned int j0 = 0, j1 = 0, j2 = 0;
4603 for (j = 0; j < 3; j++)
4605 int nelt0 = ((3 - j) * nelt) % 3;
4606 int nelt1 = ((3 - j) * nelt + 1) % 3;
4607 int nelt2 = ((3 - j) * nelt + 2) % 3;
4609 for (i = 0; i < nelt; i++)
4611 if (3 * i + nelt0 < nelt)
4612 sel[3 * i + nelt0] = j0++;
4613 if (3 * i + nelt1 < nelt)
4614 sel[3 * i + nelt1] = nelt + j1++;
4615 if (3 * i + nelt2 < nelt)
4616 sel[3 * i + nelt2] = 0;
4618 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4620 for (i = 0; i < nelt; i++)
4622 if (3 * i + nelt0 < nelt)
4623 sel[3 * i + nelt0] = 3 * i + nelt0;
4624 if (3 * i + nelt1 < nelt)
4625 sel[3 * i + nelt1] = 3 * i + nelt1;
4626 if (3 * i + nelt2 < nelt)
4627 sel[3 * i + nelt2] = nelt + j2++;
4629 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4631 vect1 = dr_chain[0];
4632 vect2 = dr_chain[1];
4634 /* Create interleaving stmt:
4635 low = VEC_PERM_EXPR <vect1, vect2,
4636 {j, nelt, *, j + 1, nelt + j + 1, *,
4637 j + 2, nelt + j + 2, *, ...}> */
4638 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4639 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4640 vect2, perm3_mask_low);
4641 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4643 vect1 = data_ref;
4644 vect2 = dr_chain[2];
4645 /* Create interleaving stmt:
4646 low = VEC_PERM_EXPR <vect1, vect2,
4647 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4648 6, 7, nelt + j + 2, ...}> */
4649 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4650 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4651 vect2, perm3_mask_high);
4652 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4653 (*result_chain)[j] = data_ref;
4656 else
4658 /* If length is not equal to 3 then only power of 2 is supported. */
4659 gcc_assert (exact_log2 (length) != -1);
4661 for (i = 0, n = nelt / 2; i < n; i++)
4663 sel[i * 2] = i;
4664 sel[i * 2 + 1] = i + nelt;
4666 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4668 for (i = 0; i < nelt; i++)
4669 sel[i] += nelt / 2;
4670 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4672 for (i = 0, n = log_length; i < n; i++)
4674 for (j = 0; j < length/2; j++)
4676 vect1 = dr_chain[j];
4677 vect2 = dr_chain[j+length/2];
4679 /* Create interleaving stmt:
4680 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4681 ...}> */
4682 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4683 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4684 vect2, perm_mask_high);
4685 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4686 (*result_chain)[2*j] = high;
4688 /* Create interleaving stmt:
4689 low = VEC_PERM_EXPR <vect1, vect2,
4690 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4691 ...}> */
4692 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4693 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4694 vect2, perm_mask_low);
4695 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4696 (*result_chain)[2*j+1] = low;
4698 memcpy (dr_chain.address (), result_chain->address (),
4699 length * sizeof (tree));
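/* Concrete instance of the masks above (an illustration, assuming V8HI,
   i.e. nelt == 8, and a power-of-two LENGTH): the generated statements
   per pair of vectors are roughly

     vect_inter_high = VEC_PERM_EXPR <v1, v2, { 0, 8, 1, 9, 2, 10, 3, 11 }>;
     vect_inter_low  = VEC_PERM_EXPR <v1, v2, { 4, 12, 5, 13, 6, 14, 7, 15 }>;

   matching the interleave_high/low description in the comment above.  */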
4704 /* Function vect_setup_realignment
4706 This function is called when vectorizing an unaligned load using
4707 the dr_explicit_realign[_optimized] scheme.
4708 This function generates the following code at the loop prolog:
4710 p = initial_addr;
4711 x msq_init = *(floor(p)); # prolog load
4712 realignment_token = call target_builtin;
4713 loop:
4714 x msq = phi (msq_init, ---)
4716 The stmts marked with x are generated only for the case of
4717 dr_explicit_realign_optimized.
4719 The code above sets up a new (vector) pointer, pointing to the first
4720 location accessed by STMT, and a "floor-aligned" load using that pointer.
4721 It also generates code to compute the "realignment-token" (if the relevant
4722 target hook was defined), and creates a phi-node at the loop-header bb
4723 whose arguments are the result of the prolog-load (created by this
4724 function) and the result of a load that takes place in the loop (to be
4725 created by the caller to this function).
4727 For the case of dr_explicit_realign_optimized:
4728 The caller to this function uses the phi-result (msq) to create the
4729 realignment code inside the loop, and sets up the missing phi argument,
4730 as follows:
4731 loop:
4732 msq = phi (msq_init, lsq)
4733 lsq = *(floor(p')); # load in loop
4734 result = realign_load (msq, lsq, realignment_token);
4736 For the case of dr_explicit_realign:
4737 loop:
4738 msq = *(floor(p)); # load in loop
4739 p' = p + (VS-1);
4740 lsq = *(floor(p')); # load in loop
4741 result = realign_load (msq, lsq, realignment_token);
4743 Input:
4744 STMT - (scalar) load stmt to be vectorized. This load accesses
4745 a memory location that may be unaligned.
4746 BSI - place where new code is to be inserted.
4747 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4748 is used.
4750 Output:
4751 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4752 target hook, if defined.
4753 Return value - the result of the loop-header phi node. */
4755 tree
4756 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4757 tree *realignment_token,
4758 enum dr_alignment_support alignment_support_scheme,
4759 tree init_addr,
4760 struct loop **at_loop)
4762 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4763 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4764 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4765 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4766 struct loop *loop = NULL;
4767 edge pe = NULL;
4768 tree scalar_dest = gimple_assign_lhs (stmt);
4769 tree vec_dest;
4770 gimple *inc;
4771 tree ptr;
4772 tree data_ref;
4773 basic_block new_bb;
4774 tree msq_init = NULL_TREE;
4775 tree new_temp;
4776 gphi *phi_stmt;
4777 tree msq = NULL_TREE;
4778 gimple_seq stmts = NULL;
4779 bool inv_p;
4780 bool compute_in_loop = false;
4781 bool nested_in_vect_loop = false;
4782 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4783 struct loop *loop_for_initial_load = NULL;
4785 if (loop_vinfo)
4787 loop = LOOP_VINFO_LOOP (loop_vinfo);
4788 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4791 gcc_assert (alignment_support_scheme == dr_explicit_realign
4792 || alignment_support_scheme == dr_explicit_realign_optimized);
4794 /* We need to generate three things:
4795 1. the misalignment computation
4796 2. the extra vector load (for the optimized realignment scheme).
4797 3. the phi node for the two vectors from which the realignment is
4798 done (for the optimized realignment scheme). */
4800 /* 1. Determine where to generate the misalignment computation.
4802 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4803 calculation will be generated by this function, outside the loop (in the
4804 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4805 caller, inside the loop.
4807 Background: If the misalignment remains fixed throughout the iterations of
4808 the loop, then both realignment schemes are applicable, and also the
4809 misalignment computation can be done outside LOOP. This is because we are
4810 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4811 are a multiple of VS (the Vector Size), and therefore the misalignment in
4812 different vectorized LOOP iterations is always the same.
4813 The problem arises only if the memory access is in an inner-loop nested
4814 inside LOOP, which is now being vectorized using outer-loop vectorization.
4815 This is the only case when the misalignment of the memory access may not
4816 remain fixed throughout the iterations of the inner-loop (as explained in
4817 detail in vect_supportable_dr_alignment). In this case, not only is the
4818 optimized realignment scheme not applicable, but also the misalignment
4819 computation (and generation of the realignment token that is passed to
4820 REALIGN_LOAD) has to be done inside the loop.
4822 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4823 or not, which in turn determines if the misalignment is computed inside
4824 the inner-loop, or outside LOOP. */
4826 if (init_addr != NULL_TREE || !loop_vinfo)
4828 compute_in_loop = true;
4829 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4833 /* 2. Determine where to generate the extra vector load.
4835 For the optimized realignment scheme, instead of generating two vector
4836 loads in each iteration, we generate a single extra vector load in the
4837 preheader of the loop, and in each iteration reuse the result of the
4838 vector load from the previous iteration. In case the memory access is in
4839 an inner-loop nested inside LOOP, which is now being vectorized using
4840 outer-loop vectorization, we need to determine whether this initial vector
4841 load should be generated at the preheader of the inner-loop, or can be
4842 generated at the preheader of LOOP. If the memory access has no evolution
4843 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4844 to be generated inside LOOP (in the preheader of the inner-loop). */
4846 if (nested_in_vect_loop)
4848 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4849 bool invariant_in_outerloop =
4850 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4851 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4853 else
4854 loop_for_initial_load = loop;
4855 if (at_loop)
4856 *at_loop = loop_for_initial_load;
4858 if (loop_for_initial_load)
4859 pe = loop_preheader_edge (loop_for_initial_load);
4861 /* 3. For the case of the optimized realignment, create the first vector
4862 load at the loop preheader. */
4864 if (alignment_support_scheme == dr_explicit_realign_optimized)
4866 /* Create msq_init = *(floor(p1)) in the loop preheader */
4867 gassign *new_stmt;
4869 gcc_assert (!compute_in_loop);
4870 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4871 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4872 NULL_TREE, &init_addr, NULL, &inc,
4873 true, &inv_p);
4874 if (TREE_CODE (ptr) == SSA_NAME)
4875 new_temp = copy_ssa_name (ptr);
4876 else
4877 new_temp = make_ssa_name (TREE_TYPE (ptr));
4878 new_stmt = gimple_build_assign
4879 (new_temp, BIT_AND_EXPR, ptr,
4880 build_int_cst (TREE_TYPE (ptr),
4881 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4882 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4883 gcc_assert (!new_bb);
4884 data_ref
4885 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4886 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4887 new_stmt = gimple_build_assign (vec_dest, data_ref);
4888 new_temp = make_ssa_name (vec_dest, new_stmt);
4889 gimple_assign_set_lhs (new_stmt, new_temp);
4890 if (pe)
4892 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4893 gcc_assert (!new_bb);
4895 else
4896 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4898 msq_init = gimple_assign_lhs (new_stmt);
4901 /* 4. Create realignment token using a target builtin, if available.
4902 It is done either inside the containing loop, or before LOOP (as
4903 determined above). */
4905 if (targetm.vectorize.builtin_mask_for_load)
4907 gcall *new_stmt;
4908 tree builtin_decl;
4910 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4911 if (!init_addr)
4913 /* Generate the INIT_ADDR computation outside LOOP. */
4914 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4915 NULL_TREE, loop);
4916 if (loop)
4918 pe = loop_preheader_edge (loop);
4919 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4920 gcc_assert (!new_bb);
4922 else
4923 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4926 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4927 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4928 vec_dest =
4929 vect_create_destination_var (scalar_dest,
4930 gimple_call_return_type (new_stmt));
4931 new_temp = make_ssa_name (vec_dest, new_stmt);
4932 gimple_call_set_lhs (new_stmt, new_temp);
4934 if (compute_in_loop)
4935 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4936 else
4938 /* Generate the misalignment computation outside LOOP. */
4939 pe = loop_preheader_edge (loop);
4940 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4941 gcc_assert (!new_bb);
4944 *realignment_token = gimple_call_lhs (new_stmt);
4946 /* The result of the CALL_EXPR to this builtin is determined from
4947 the value of the parameter and no global variables are touched
4948 which makes the builtin a "const" function. Requiring the
4949 builtin to have the "const" attribute makes it unnecessary
4950 to call mark_call_clobbered. */
4951 gcc_assert (TREE_READONLY (builtin_decl));
4954 if (alignment_support_scheme == dr_explicit_realign)
4955 return msq;
4957 gcc_assert (!compute_in_loop);
4958 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4961 /* 5. Create msq = phi <msq_init, lsq> in loop */
4963 pe = loop_preheader_edge (containing_loop);
4964 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4965 msq = make_ssa_name (vec_dest);
4966 phi_stmt = create_phi_node (msq, containing_loop->header);
4967 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4969 return msq;
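/* Illustrative sketch (an assumption about the emitted code, for a 16-byte
   V4SI vectype under dr_explicit_realign_optimized): the prolog created
   above is roughly

     vectp.0  = <initial address>;
     floor_p  = vectp.0 & -16;		mask off the misaligned low bits
     msq_init = *floor_p;		prolog load
     LOOP: msq = PHI <msq_init, lsq>;	lsq is filled in later by the caller

   matching the scheme documented before this function.  */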
4973 /* Function vect_grouped_load_supported.
4975 Returns TRUE if even and odd permutations are supported,
4976 and FALSE otherwise. */
4978 bool
4979 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4981 machine_mode mode = TYPE_MODE (vectype);
4983 /* vect_permute_load_chain requires the group size to be equal to 3 or
4984 be a power of two. */
4985 if (count != 3 && exact_log2 (count) == -1)
4987 if (dump_enabled_p ())
4988 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4989 "the size of the group of accesses"
4990 " is not a power of 2 or not equal to 3\n");
4991 return false;
4994 /* Check that the permutation is supported. */
4995 if (VECTOR_MODE_P (mode))
4997 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
4998 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5000 if (count == 3)
5002 unsigned int k;
5003 for (k = 0; k < 3; k++)
5005 for (i = 0; i < nelt; i++)
5006 if (3 * i + k < 2 * nelt)
5007 sel[i] = 3 * i + k;
5008 else
5009 sel[i] = 0;
5010 if (!can_vec_perm_p (mode, false, sel))
5012 if (dump_enabled_p ())
5013 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5014 "shuffle of 3 loads is not supported by"
5015 " target\n");
5016 return false;
5018 for (i = 0, j = 0; i < nelt; i++)
5019 if (3 * i + k < 2 * nelt)
5020 sel[i] = i;
5021 else
5022 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5023 if (!can_vec_perm_p (mode, false, sel))
5025 if (dump_enabled_p ())
5026 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5027 "shuffle of 3 loads is not supported by"
5028 " target\n");
5029 return false;
5032 return true;
5034 else
5036 /* If length is not equal to 3 then only power of 2 is supported. */
5037 gcc_assert (exact_log2 (count) != -1);
5038 for (i = 0; i < nelt; i++)
5039 sel[i] = i * 2;
5040 if (can_vec_perm_p (mode, false, sel))
5042 for (i = 0; i < nelt; i++)
5043 sel[i] = i * 2 + 1;
5044 if (can_vec_perm_p (mode, false, sel))
5045 return true;
5050 if (dump_enabled_p ())
5051 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5052 "extract even/odd not supported by target\n");
5053 return false;
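/* Illustrative example (not from the original sources): for V4SI and a
   power-of-two COUNT, the two masks checked above are, with nelt == 4,

     extract even:  { 0, 2, 4, 6 }
     extract odd:   { 1, 3, 5, 7 }

   which are the masks vect_permute_load_chain generates below.  */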
5056 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5057 type VECTYPE. */
5059 bool
5060 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5062 return vect_lanes_optab_supported_p ("vec_load_lanes",
5063 vec_load_lanes_optab,
5064 vectype, count);
5067 /* Function vect_permute_load_chain.
5069 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5070 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5071 the input data correctly. Return the final references for loads in
5072 RESULT_CHAIN.
5074 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5075 The input is 4 vectors each containing 8 elements. We assign a number to each
5076 element, the input sequence is:
5078 1st vec: 0 1 2 3 4 5 6 7
5079 2nd vec: 8 9 10 11 12 13 14 15
5080 3rd vec: 16 17 18 19 20 21 22 23
5081 4th vec: 24 25 26 27 28 29 30 31
5083 The output sequence should be:
5085 1st vec: 0 4 8 12 16 20 24 28
5086 2nd vec: 1 5 9 13 17 21 25 29
5087 3rd vec: 2 6 10 14 18 22 26 30
5088 4th vec: 3 7 11 15 19 23 27 31
5090 i.e., the first output vector should contain the first elements of each
5091 interleaving group, etc.
5093 We use extract_even/odd instructions to create such output. The input of
5094 each extract_even/odd operation is two vectors
5095 1st vec 2nd vec
5096 0 1 2 3 4 5 6 7
5098 and the output is the vector of extracted even/odd elements. The output of
5099 extract_even will be: 0 2 4 6
5100 and of extract_odd: 1 3 5 7
5103 The permutation is done in log LENGTH stages. In each stage extract_even
5104 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5105 their order. In our example,
5107 E1: extract_even (1st vec, 2nd vec)
5108 E2: extract_odd (1st vec, 2nd vec)
5109 E3: extract_even (3rd vec, 4th vec)
5110 E4: extract_odd (3rd vec, 4th vec)
5112 The output for the first stage will be:
5114 E1: 0 2 4 6 8 10 12 14
5115 E2: 1 3 5 7 9 11 13 15
5116 E3: 16 18 20 22 24 26 28 30
5117 E4: 17 19 21 23 25 27 29 31
5119 In order to proceed and create the correct sequence for the next stage (or
5120 for the correct output, if the second stage is the last one, as in our
5121 example), we first put the output of extract_even operation and then the
5122 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5123 The input for the second stage is:
5125 1st vec (E1): 0 2 4 6 8 10 12 14
5126 2nd vec (E3): 16 18 20 22 24 26 28 30
5127 3rd vec (E2): 1 3 5 7 9 11 13 15
5128 4th vec (E4): 17 19 21 23 25 27 29 31
5130 The output of the second stage:
5132 E1: 0 4 8 12 16 20 24 28
5133 E2: 2 6 10 14 18 22 26 30
5134 E3: 1 5 9 13 17 21 25 29
5135 E4: 3 7 11 15 19 23 27 31
5137 And RESULT_CHAIN after reordering:
5139 1st vec (E1): 0 4 8 12 16 20 24 28
5140 2nd vec (E3): 1 5 9 13 17 21 25 29
5141 3rd vec (E2): 2 6 10 14 18 22 26 30
5142 4th vec (E4): 3 7 11 15 19 23 27 31. */
5144 static void
5145 vect_permute_load_chain (vec<tree> dr_chain,
5146 unsigned int length,
5147 gimple *stmt,
5148 gimple_stmt_iterator *gsi,
5149 vec<tree> *result_chain)
5151 tree data_ref, first_vect, second_vect;
5152 tree perm_mask_even, perm_mask_odd;
5153 tree perm3_mask_low, perm3_mask_high;
5154 gimple *perm_stmt;
5155 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5156 unsigned int i, j, log_length = exact_log2 (length);
5157 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5158 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5160 result_chain->quick_grow (length);
5161 memcpy (result_chain->address (), dr_chain.address (),
5162 length * sizeof (tree));
5164 if (length == 3)
5166 unsigned int k;
5168 for (k = 0; k < 3; k++)
5170 for (i = 0; i < nelt; i++)
5171 if (3 * i + k < 2 * nelt)
5172 sel[i] = 3 * i + k;
5173 else
5174 sel[i] = 0;
5175 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5177 for (i = 0, j = 0; i < nelt; i++)
5178 if (3 * i + k < 2 * nelt)
5179 sel[i] = i;
5180 else
5181 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5183 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5185 first_vect = dr_chain[0];
5186 second_vect = dr_chain[1];
5188 /* Create interleaving stmt (low part of):
5189 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5190 ...}> */
5191 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5192 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5193 second_vect, perm3_mask_low);
5194 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5196 /* Create interleaving stmt (high part of):
5197 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5198 ...}> */
5199 first_vect = data_ref;
5200 second_vect = dr_chain[2];
5201 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5202 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5203 second_vect, perm3_mask_high);
5204 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5205 (*result_chain)[k] = data_ref;
5208 else
5210 /* If length is not equal to 3 then only power of 2 is supported. */
5211 gcc_assert (exact_log2 (length) != -1);
5213 for (i = 0; i < nelt; ++i)
5214 sel[i] = i * 2;
5215 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5217 for (i = 0; i < nelt; ++i)
5218 sel[i] = i * 2 + 1;
5219 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5221 for (i = 0; i < log_length; i++)
5223 for (j = 0; j < length; j += 2)
5225 first_vect = dr_chain[j];
5226 second_vect = dr_chain[j+1];
5228 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5229 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5230 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5231 first_vect, second_vect,
5232 perm_mask_even);
5233 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5234 (*result_chain)[j/2] = data_ref;
5236 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5237 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5238 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5239 first_vect, second_vect,
5240 perm_mask_odd);
5241 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5242 (*result_chain)[j/2+length/2] = data_ref;
5244 memcpy (dr_chain.address (), result_chain->address (),
5245 length * sizeof (tree));
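/* Concrete instance of the masks above (an illustration, assuming V8HI,
   i.e. nelt == 8, and a power-of-two LENGTH): each pair of input vectors
   is reduced with roughly

     vect_perm_even = VEC_PERM_EXPR <v1, v2, { 0, 2, 4, 6, 8, 10, 12, 14 }>;
     vect_perm_odd  = VEC_PERM_EXPR <v1, v2, { 1, 3, 5, 7, 9, 11, 13, 15 }>;

   i.e. the extract_even/odd operations described in the comment above.  */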
5250 /* Function vect_shift_permute_load_chain.
5252 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5253 sequence of stmts to reorder the input data accordingly.
5254 Return the final references for loads in RESULT_CHAIN.
5255 Return true if successful, false otherwise.
5257 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5258 The input is 3 vectors each containing 8 elements. We assign a
5259 number to each element, the input sequence is:
5261 1st vec: 0 1 2 3 4 5 6 7
5262 2nd vec: 8 9 10 11 12 13 14 15
5263 3rd vec: 16 17 18 19 20 21 22 23
5265 The output sequence should be:
5267 1st vec: 0 3 6 9 12 15 18 21
5268 2nd vec: 1 4 7 10 13 16 19 22
5269 3rd vec: 2 5 8 11 14 17 20 23
5271 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5273 First we shuffle all 3 vectors to get correct elements order:
5275 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5276 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5277 3rd vec: (16 19 22) (17 20 23) (18 21)
5279 Next we unite and shift vector 3 times:
5281 1st step:
5282 shift right by 6 the concatenation of:
5283 "1st vec" and "2nd vec"
5284 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5285 "2nd vec" and "3rd vec"
5286 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5287 "3rd vec" and "1st vec"
5288 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5289 | New vectors |
5291 So that now new vectors are:
5293 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5294 2nd vec: (10 13) (16 19 22) (17 20 23)
5295 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5297 2nd step:
5298 shift right by 5 the concatenation of:
5299 "1st vec" and "3rd vec"
5300 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5301 "2nd vec" and "1st vec"
5302 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5303 "3rd vec" and "2nd vec"
5304 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5305 | New vectors |
5307 So that now new vectors are:
5309 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5310 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5311 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5313 3rd step:
5314 shift right by 5 the concatenation of:
5315 "1st vec" and "1st vec"
5316 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5317 shift right by 3 the concatenation of:
5318 "2nd vec" and "2nd vec"
5319 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5320 | New vectors |
5322 So that now all vectors are READY:
5323 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5324 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5325 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5327 This algorithm is faster than the one in vect_permute_load_chain if:
5328 1. "shift of a concatenation" is faster than general permutation.
5329 This is usually so.
5330 2. The TARGET machine can't execute vector instructions in parallel.
5331 This is because each step of the algorithm depends on the previous one.
5332 The algorithm in vect_permute_load_chain is much more parallel.
5334 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5337 static bool
5338 vect_shift_permute_load_chain (vec<tree> dr_chain,
5339 unsigned int length,
5340 gimple *stmt,
5341 gimple_stmt_iterator *gsi,
5342 vec<tree> *result_chain)
5344 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5345 tree perm2_mask1, perm2_mask2, perm3_mask;
5346 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5347 gimple *perm_stmt;
5349 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5350 unsigned int i;
5351 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5352 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5353 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5354 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5356 result_chain->quick_grow (length);
5357 memcpy (result_chain->address (), dr_chain.address (),
5358 length * sizeof (tree));
5360 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5362 unsigned int j, log_length = exact_log2 (length);
5363 for (i = 0; i < nelt / 2; ++i)
5364 sel[i] = i * 2;
5365 for (i = 0; i < nelt / 2; ++i)
5366 sel[nelt / 2 + i] = i * 2 + 1;
5367 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5369 if (dump_enabled_p ())
5370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5371 "shuffle of 2 fields structure is not \
5372 supported by target\n");
5373 return false;
5375 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5377 for (i = 0; i < nelt / 2; ++i)
5378 sel[i] = i * 2 + 1;
5379 for (i = 0; i < nelt / 2; ++i)
5380 sel[nelt / 2 + i] = i * 2;
5381 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5383 if (dump_enabled_p ())
5384 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5385 "shuffle of 2 fields structure is not \
5386 supported by target\n");
5387 return false;
5389 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5391 /* Generating permutation constant to shift all elements.
5392 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5393 for (i = 0; i < nelt; i++)
5394 sel[i] = nelt / 2 + i;
5395 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5397 if (dump_enabled_p ())
5398 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5399 "shift permutation is not supported by target\n");
5400 return false;
5402 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5404 /* Generating permutation constant to select vector from 2.
5405 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5406 for (i = 0; i < nelt / 2; i++)
5407 sel[i] = i;
5408 for (i = nelt / 2; i < nelt; i++)
5409 sel[i] = nelt + i;
5410 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5412 if (dump_enabled_p ())
5413 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5414 "select is not supported by target\n");
5415 return false;
5417 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5419 for (i = 0; i < log_length; i++)
5421 for (j = 0; j < length; j += 2)
5423 first_vect = dr_chain[j];
5424 second_vect = dr_chain[j + 1];
5426 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5427 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5428 first_vect, first_vect,
5429 perm2_mask1);
5430 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5431 vect[0] = data_ref;
5433 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5434 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5435 second_vect, second_vect,
5436 perm2_mask2);
5437 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5438 vect[1] = data_ref;
5440 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5441 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5442 vect[0], vect[1], shift1_mask);
5443 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5444 (*result_chain)[j/2 + length/2] = data_ref;
5446 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5447 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5448 vect[0], vect[1], select_mask);
5449 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5450 (*result_chain)[j/2] = data_ref;
5452 memcpy (dr_chain.address (), result_chain->address (),
5453 length * sizeof (tree));
5455 return true;
5457 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5459 unsigned int k = 0, l = 0;
5461 /* Generating permutation constant to get all elements in the right order.
5462 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5463 for (i = 0; i < nelt; i++)
5465 if (3 * k + (l % 3) >= nelt)
5467 k = 0;
5468 l += (3 - (nelt % 3));
5470 sel[i] = 3 * k + (l % 3);
5471 k++;
5473 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5475 if (dump_enabled_p ())
5476 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5477 "shuffle of 3 fields structure is not \
5478 supported by target\n");
5479 return false;
5481 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5483 /* Generating permutation constant to shift all elements.
5484 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5485 for (i = 0; i < nelt; i++)
5486 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5487 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5489 if (dump_enabled_p ())
5490 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5491 "shift permutation is not supported by target\n");
5492 return false;
5494 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5496 /* Generating permutation constant to shift all elements.
5497 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5498 for (i = 0; i < nelt; i++)
5499 sel[i] = 2 * (nelt / 3) + 1 + i;
5500 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5502 if (dump_enabled_p ())
5503 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5504 "shift permutation is not supported by target\n");
5505 return false;
5507 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5509 /* Generating permutation constant to shift all elements.
5510 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5511 for (i = 0; i < nelt; i++)
5512 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5513 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5515 if (dump_enabled_p ())
5516 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5517 "shift permutation is not supported by target\n");
5518 return false;
5520 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5522 /* Generating permutation constant to shift all elements.
5523 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5524 for (i = 0; i < nelt; i++)
5525 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5526 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5528 if (dump_enabled_p ())
5529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5530 "shift permutation is not supported by target\n");
5531 return false;
5533 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
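      /* First shuffle each input vector so that the lanes of each field are
	 grouped together; the two rounds of shifts below then merge those
	 groups across the three vectors, and the last two shifts rotate two
	 of the results into their final lane order.  */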
5535 for (k = 0; k < 3; k++)
5537 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5538 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5539 dr_chain[k], dr_chain[k],
5540 perm3_mask);
5541 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5542 vect[k] = data_ref;
5545 for (k = 0; k < 3; k++)
5547 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5548 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5549 vect[k % 3], vect[(k + 1) % 3],
5550 shift1_mask);
5551 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5552 vect_shift[k] = data_ref;
5555 for (k = 0; k < 3; k++)
5557 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5558 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5559 vect_shift[(4 - k) % 3],
5560 vect_shift[(3 - k) % 3],
5561 shift2_mask);
5562 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5563 vect[k] = data_ref;
5566 (*result_chain)[3 - (nelt % 3)] = vect[2];
5568 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5569 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5570 vect[0], shift3_mask);
5571 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5572 (*result_chain)[nelt % 3] = data_ref;
5574 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5575 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5576 vect[1], shift4_mask);
5577 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5578 (*result_chain)[0] = data_ref;
5579 return true;
5581 return false;
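/* Illustrative sketch only -- not part of the vectorizer.  It re-plays the
   LENGTH == 3 path above for a hypothetical 8-element int vector using
   GCC's generic vector extensions; all names below are made up.  IN0..IN2
   hold 24 consecutive elements of a 3-way interleaved sequence, OUT[0..2]
   receive the three de-interleaved fields.  Kept out of the build with
   #if 0.  */
#if 0
typedef int v8si __attribute__ ((vector_size (8 * sizeof (int))));

static void
shift_permute_3fields (v8si in0, v8si in1, v8si in2, v8si out[3])
{
  /* The masks the code above computes for nelt == 8.  */
  v8si perm3  = { 0, 3, 6, 1, 4, 7, 2, 5 };
  v8si shift1 = { 6, 7, 8, 9, 10, 11, 12, 13 };
  v8si shift2 = { 5, 6, 7, 8, 9, 10, 11, 12 };
  v8si shift3 = { 3, 4, 5, 6, 7, 8, 9, 10 };
  v8si shift4 = { 5, 6, 7, 8, 9, 10, 11, 12 };

  /* Group the lanes of each field inside every input vector.  */
  v8si v0 = __builtin_shuffle (in0, in0, perm3);
  v8si v1 = __builtin_shuffle (in1, in1, perm3);
  v8si v2 = __builtin_shuffle (in2, in2, perm3);

  /* First round of shifts over concatenated neighbours.  */
  v8si s0 = __builtin_shuffle (v0, v1, shift1);
  v8si s1 = __builtin_shuffle (v1, v2, shift1);
  v8si s2 = __builtin_shuffle (v2, v0, shift1);

  /* Second round of shifts merges each field into one vector.  */
  v8si w0 = __builtin_shuffle (s1, s0, shift2);
  v8si w1 = __builtin_shuffle (s0, s2, shift2);
  v8si w2 = __builtin_shuffle (s2, s1, shift2);

  out[1] = w2;                                    /* 2nd field.  */
  out[2] = __builtin_shuffle (w0, w0, shift3);    /* 3rd field.  */
  out[0] = __builtin_shuffle (w1, w1, shift4);    /* 1st field.  */
}
#endif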
5584 /* Function vect_transform_grouped_load.
5586 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5587 to perform their permutation, and attach the resulting vectorized statements
5588 to the scalar statements.
5591 void
5592 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5593 gimple_stmt_iterator *gsi)
5595 machine_mode mode;
5596 vec<tree> result_chain = vNULL;
5598 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5599 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
5600 permuted vectors, which are ready for vector computation. */
5601 result_chain.create (size);
5603 /* If the reassociation width for the vector type is 2 or greater, the target
5604 machine can execute 2 or more vector instructions in parallel.  Otherwise
5605 try to get the chain for the load group using vect_shift_permute_load_chain. */
5606 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5607 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5608 || exact_log2 (size) != -1
5609 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5610 gsi, &result_chain))
5611 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5612 vect_record_grouped_load_vectors (stmt, result_chain);
5613 result_chain.release ();
5616 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5617 generated as part of the vectorization of STMT. Assign the statement
5618 for each vector to the associated scalar statement. */
5620 void
5621 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5623 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5624 gimple *next_stmt, *new_stmt;
5625 unsigned int i, gap_count;
5626 tree tmp_data_ref;
5628 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5629 Since we scan the chain starting from its first node, their order
5630 corresponds to the order of data-refs in RESULT_CHAIN. */
5631 next_stmt = first_stmt;
5632 gap_count = 1;
5633 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5635 if (!next_stmt)
5636 break;
5638 /* Skip the gaps.  Loads created for the gaps will be removed later by the
5639 dead code elimination pass.  No need to check for the first stmt in
5640 the group, since it always exists.
5641 GROUP_GAP is the number of steps in elements from the previous
5642 access (if there is no gap, GROUP_GAP is 1).  We skip loads that
5643 correspond to the gaps. */
5644 if (next_stmt != first_stmt
5645 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5647 gap_count++;
5648 continue;
5651 while (next_stmt)
5653 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5654 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5655 copies, and we put the new vector statement in the first available
5656 RELATED_STMT. */
5657 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5658 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5659 else
5661 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5663 gimple *prev_stmt =
5664 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5665 gimple *rel_stmt =
5666 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5667 while (rel_stmt)
5669 prev_stmt = rel_stmt;
5670 rel_stmt =
5671 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5674 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5675 new_stmt;
5679 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5680 gap_count = 1;
5681 /* If NEXT_STMT accesses the same DR as the previous statement,
5682 put the same TMP_DATA_REF as its vectorized statement; otherwise
5683 get the next data-ref from RESULT_CHAIN. */
5684 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5685 break;
5690 /* Function vect_can_force_dr_alignment_p.
5692 Returns whether the alignment of a DECL can be forced to be aligned
5693 on an ALIGNMENT-bit boundary. */
5695 bool
5696 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5698 if (TREE_CODE (decl) != VAR_DECL)
5699 return false;
5701 if (decl_in_symtab_p (decl)
5702 && !symtab_node::get (decl)->can_increase_alignment_p ())
5703 return false;
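  /* A static object is laid out in the object file, so its alignment is
     limited by MAX_OFILE_ALIGNMENT; an automatic variable lives on the
     stack and is limited by MAX_STACK_ALIGNMENT.  */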
5705 if (TREE_STATIC (decl))
5706 return (alignment <= MAX_OFILE_ALIGNMENT);
5707 else
5708 return (alignment <= MAX_STACK_ALIGNMENT);
5712 /* Return whether the data reference DR is supported with respect to its
5713 alignment.
5714 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5715 if it is aligned, i.e., check if it is possible to vectorize it with a
5716 different alignment. */
5718 enum dr_alignment_support
5719 vect_supportable_dr_alignment (struct data_reference *dr,
5720 bool check_aligned_accesses)
5722 gimple *stmt = DR_STMT (dr);
5723 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5724 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5725 machine_mode mode = TYPE_MODE (vectype);
5726 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5727 struct loop *vect_loop = NULL;
5728 bool nested_in_vect_loop = false;
5730 if (aligned_access_p (dr) && !check_aligned_accesses)
5731 return dr_aligned;
5733 /* For now assume all conditional loads/stores support unaligned
5734 access without any special code. */
5735 if (is_gimple_call (stmt)
5736 && gimple_call_internal_p (stmt)
5737 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5738 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5739 return dr_unaligned_supported;
5741 if (loop_vinfo)
5743 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5744 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5747 /* Possibly unaligned access. */
5749 /* We can choose between using the implicit realignment scheme (generating
5750 a misaligned_move stmt) and the explicit realignment scheme (generating
5751 aligned loads with a REALIGN_LOAD). There are two variants to the
5752 explicit realignment scheme: optimized, and unoptimized.
5753 We can optimize the realignment only if the step between consecutive
5754 vector loads is equal to the vector size. Since the vector memory
5755 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5756 is guaranteed that the misalignment amount remains the same throughout the
5757 execution of the vectorized loop. Therefore, we can create the
5758 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5759 at the loop preheader.
5761 However, in the case of outer-loop vectorization, when vectorizing a
5762 memory access in the inner-loop nested within the LOOP that is now being
5763 vectorized, while it is guaranteed that the misalignment of the
5764 vectorized memory access will remain the same in different outer-loop
5765 iterations, it is *not* guaranteed that it will remain the same throughout
5766 the execution of the inner-loop. This is because the inner-loop advances
5767 with the original scalar step (and not in steps of VS). If the inner-loop
5768 step happens to be a multiple of VS, then the misalignment remains fixed
5769 and we can use the optimized realignment scheme. For example:
5771 for (i=0; i<N; i++)
5772 for (j=0; j<M; j++)
5773 s += a[i+j];
5775 When vectorizing the i-loop in the above example, the step between
5776 consecutive vector loads is 1, and so the misalignment does not remain
5777 fixed across the execution of the inner-loop, and the realignment cannot
5778 be optimized (as illustrated in the following pseudo vectorized loop):
5780 for (i=0; i<N; i+=4)
5781 for (j=0; j<M; j++){
5782 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5783 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5784 // (assuming that we start from an aligned address).
5787 We therefore have to use the unoptimized realignment scheme:
5789 for (i=0; i<N; i+=4)
5790 for (j=k; j<M; j+=4)
5791 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5792 // that the misalignment of the initial address is
5793 // 0).
5795 The loop can then be vectorized as follows:
5797 for (k=0; k<4; k++){
5798 rt = get_realignment_token (&vp[k]);
5799 for (i=0; i<N; i+=4){
5800 v1 = vp[i+k];
5801 for (j=k; j<M; j+=4){
5802 v2 = vp[i+j+VS-1];
5803 va = REALIGN_LOAD <v1,v2,rt>;
5804 vs += va;
5805 v1 = v2;
5808 } */
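  /* Decide what kind of unaligned access the target can handle: loads may
     use the explicit realignment scheme when the target provides
     vec_realign_load (the optimized variant only when the misalignment is
     known to stay fixed across the loop); otherwise both loads and stores
     fall back to the target's misaligned-access support hook.  */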
5810 if (DR_IS_READ (dr))
5812 bool is_packed = false;
5813 tree type = (TREE_TYPE (DR_REF (dr)));
5815 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5816 && (!targetm.vectorize.builtin_mask_for_load
5817 || targetm.vectorize.builtin_mask_for_load ()))
5819 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5820 if ((nested_in_vect_loop
5821 && (TREE_INT_CST_LOW (DR_STEP (dr))
5822 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5823 || !loop_vinfo)
5824 return dr_explicit_realign;
5825 else
5826 return dr_explicit_realign_optimized;
5828 if (!known_alignment_for_access_p (dr))
5829 is_packed = not_size_aligned (DR_REF (dr));
5831 if ((TYPE_USER_ALIGN (type) && !is_packed)
5832 || targetm.vectorize.
5833 support_vector_misalignment (mode, type,
5834 DR_MISALIGNMENT (dr), is_packed))
5835 /* Can't software pipeline the loads, but can at least do them. */
5836 return dr_unaligned_supported;
5838 else
5840 bool is_packed = false;
5841 tree type = (TREE_TYPE (DR_REF (dr)));
5843 if (!known_alignment_for_access_p (dr))
5844 is_packed = not_size_aligned (DR_REF (dr));
5846 if ((TYPE_USER_ALIGN (type) && !is_packed)
5847 || targetm.vectorize.
5848 support_vector_misalignment (mode, type,
5849 DR_MISALIGNMENT (dr), is_packed))
5850 return dr_unaligned_supported;
5853 /* Unsupported. */
5854 return dr_unaligned_unsupported;