[official-gcc.git] / gcc / tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2013 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "tm_p.h"
30 #include "target.h"
31 #include "basic-block.h"
32 #include "gimple-pretty-print.h"
33 #include "tree-flow.h"
34 #include "dumpfile.h"
35 #include "cfgloop.h"
36 #include "tree-chrec.h"
37 #include "tree-scalar-evolution.h"
38 #include "tree-vectorizer.h"
39 #include "diagnostic-core.h"
41 /* Need to include rtl.h, expr.h, etc. for optabs. */
42 #include "expr.h"
43 #include "optabs.h"
45 /* Return true if load- or store-lanes optab OPTAB is implemented for
46 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
48 static bool
49 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
50 tree vectype, unsigned HOST_WIDE_INT count)
52 enum machine_mode mode, array_mode;
53 bool limit_p;
55 mode = TYPE_MODE (vectype);
56 limit_p = !targetm.array_mode_supported_p (mode, count);
57 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
58 MODE_INT, limit_p);
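/* For example, with a 128-bit VECTYPE such as V4SI and COUNT == 2 this
   asks mode_for_size for a 256-bit integer mode; if that comes back as
   BLKmode there is no usable array mode and we fail below.  */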
60 if (array_mode == BLKmode)
62 if (dump_enabled_p ())
63 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
64 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]",
65 GET_MODE_NAME (mode), count);
66 return false;
69 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
71 if (dump_enabled_p ())
72 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
73 "cannot use %s<%s><%s>", name,
74 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
75 return false;
78 if (dump_enabled_p ())
79 dump_printf_loc (MSG_NOTE, vect_location,
80 "can use %s<%s><%s>", name, GET_MODE_NAME (array_mode),
81 GET_MODE_NAME (mode));
83 return true;
87 /* Return the smallest scalar part of STMT.
88 This is used to determine the vectype of the stmt. We generally set the
89 vectype according to the type of the result (lhs). For stmts whose
90 result-type is different than the type of the arguments (e.g., demotion,
91 promotion), vectype will be reset appropriately (later). Note that we have
92 to visit the smallest datatype in this function, because that determines the
93 VF. If the smallest datatype in the loop is present only as the rhs of a
94 promotion operation - we'd miss it.
95 Such a case, where a variable of this datatype does not appear in the lhs
96 anywhere in the loop, can only occur if it's an invariant: e.g.:
97 'int_x = (int) short_inv', which we'd expect to have been optimized away by
98 invariant motion. However, we cannot rely on invariant motion to always
99 take invariants out of the loop, and so in the case of promotion we also
100 have to check the rhs.
101 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
102 types. */
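/* For example, for 'int_x = (int) short_y' (assuming a 4-byte int and a
   2-byte short) *LHS_SIZE_UNIT is set to 4, *RHS_SIZE_UNIT to 2, and the
   type of short_y is returned as the smallest scalar type.  */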
104 tree
105 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
106 HOST_WIDE_INT *rhs_size_unit)
108 tree scalar_type = gimple_expr_type (stmt);
109 HOST_WIDE_INT lhs, rhs;
111 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
113 if (is_gimple_assign (stmt)
114 && (gimple_assign_cast_p (stmt)
115 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
116 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
117 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
119 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
121 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
122 if (rhs < lhs)
123 scalar_type = rhs_type;
126 *lhs_size_unit = lhs;
127 *rhs_size_unit = rhs;
128 return scalar_type;
132 /* Check if the data references pointed to by DR_I and DR_J are the same or
133 belong to the same interleaving group.  Return FALSE if the drs are
134 different, otherwise return TRUE. */
136 static bool
137 vect_same_range_drs (data_reference_p dr_i, data_reference_p dr_j)
139 gimple stmt_i = DR_STMT (dr_i);
140 gimple stmt_j = DR_STMT (dr_j);
142 if (operand_equal_p (DR_REF (dr_i), DR_REF (dr_j), 0)
143 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
144 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j))
145 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
146 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j)))))
147 return true;
148 else
149 return false;
152 /* If address ranges represented by DDR_I and DDR_J are equal,
153 return TRUE, otherwise return FALSE. */
155 static bool
156 vect_vfa_range_equal (ddr_p ddr_i, ddr_p ddr_j)
158 if ((vect_same_range_drs (DDR_A (ddr_i), DDR_A (ddr_j))
159 && vect_same_range_drs (DDR_B (ddr_i), DDR_B (ddr_j)))
160 || (vect_same_range_drs (DDR_A (ddr_i), DDR_B (ddr_j))
161 && vect_same_range_drs (DDR_B (ddr_i), DDR_A (ddr_j))))
162 return true;
163 else
164 return false;
167 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
168 tested at run-time. Return TRUE if DDR was successfully inserted.
169 Return false if versioning is not supported. */
171 static bool
172 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
174 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
176 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
177 return false;
179 if (dump_enabled_p ())
181 dump_printf_loc (MSG_NOTE, vect_location,
182 "mark for run-time aliasing test between ");
183 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
184 dump_printf (MSG_NOTE, " and ");
185 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
188 if (optimize_loop_nest_for_size_p (loop))
190 if (dump_enabled_p ())
191 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
192 "versioning not supported when optimizing for size.");
193 return false;
196 /* FORNOW: We don't support versioning with outer-loop vectorization. */
197 if (loop->inner)
199 if (dump_enabled_p ())
200 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
201 "versioning not yet supported for outer-loops.");
202 return false;
205 /* FORNOW: We don't support creating runtime alias tests for non-constant
206 step. */
207 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
208 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
210 if (dump_enabled_p ())
211 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
212 "versioning not yet supported for non-constant "
213 "step");
214 return false;
217 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
218 return true;
222 /* Function vect_analyze_data_ref_dependence.
224 Return TRUE if there (might) exist a dependence between a memory-reference
225 DRA and a memory-reference DRB.  If versioning for alias can check the
226 dependence at run-time, return FALSE.  Adjust *MAX_VF according to
227 the data dependence. */
229 static bool
230 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
231 loop_vec_info loop_vinfo, int *max_vf)
233 unsigned int i;
234 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
235 struct data_reference *dra = DDR_A (ddr);
236 struct data_reference *drb = DDR_B (ddr);
237 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
238 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
239 lambda_vector dist_v;
240 unsigned int loop_depth;
242 /* In loop analysis all data references should be vectorizable. */
243 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
244 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
245 gcc_unreachable ();
247 /* Independent data accesses. */
248 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
249 return false;
251 if (dra == drb
252 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
253 return false;
255 /* Unknown data dependence. */
256 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
258 if (STMT_VINFO_GATHER_P (stmtinfo_a)
259 || STMT_VINFO_GATHER_P (stmtinfo_b))
261 if (dump_enabled_p ())
263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
264 "versioning for alias not supported for: "
265 "can't determine dependence between ");
266 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
267 DR_REF (dra));
268 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
269 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
270 DR_REF (drb));
272 return true;
275 if (dump_enabled_p ())
277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
278 "versioning for alias required: "
279 "can't determine dependence between ");
280 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
281 DR_REF (dra));
282 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
283 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
284 DR_REF (drb));
287 /* Add to list of ddrs that need to be tested at run-time. */
288 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
291 /* Known data dependence. */
292 if (DDR_NUM_DIST_VECTS (ddr) == 0)
294 if (STMT_VINFO_GATHER_P (stmtinfo_a)
295 || STMT_VINFO_GATHER_P (stmtinfo_b))
297 if (dump_enabled_p ())
299 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
300 "versioning for alias not supported for: "
301 "bad dist vector for ");
302 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
303 DR_REF (dra));
304 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
305 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
306 DR_REF (drb));
308 return true;
311 if (dump_enabled_p ())
313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
314 "versioning for alias required: "
315 "bad dist vector for ");
316 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
317 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
318 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
320 /* Add to list of ddrs that need to be tested at run-time. */
321 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
324 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
325 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
327 int dist = dist_v[loop_depth];
329 if (dump_enabled_p ())
330 dump_printf_loc (MSG_NOTE, vect_location,
331 "dependence distance = %d.", dist);
333 if (dist == 0)
335 if (dump_enabled_p ())
337 dump_printf_loc (MSG_NOTE, vect_location,
338 "dependence distance == 0 between ");
339 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
340 dump_printf (MSG_NOTE, " and ");
341 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
344 /* When we perform grouped accesses and perform implicit CSE
345 by detecting equal accesses and doing disambiguation with
346 runtime alias tests like for
347 .. = a[i];
348 .. = a[i+1];
349 a[i] = ..;
350 a[i+1] = ..;
351 *p = ..;
352 .. = a[i];
353 .. = a[i+1];
354 where we will end up loading { a[i], a[i+1] } once, make
355 sure that inserting group loads before the first load and
356 stores after the last store will do the right thing. */
357 if ((STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
358 && GROUP_SAME_DR_STMT (stmtinfo_a))
359 || (STMT_VINFO_GROUPED_ACCESS (stmtinfo_b)
360 && GROUP_SAME_DR_STMT (stmtinfo_b)))
362 gimple earlier_stmt;
363 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
364 if (DR_IS_WRITE
365 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
367 if (dump_enabled_p ())
368 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
369 "READ_WRITE dependence in interleaving.");
370 return true;
374 continue;
377 if (dist > 0 && DDR_REVERSED_P (ddr))
379 /* If DDR_REVERSED_P the order of the data-refs in DDR was
380 reversed (to make distance vector positive), and the actual
381 distance is negative. */
382 if (dump_enabled_p ())
383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
384 "dependence distance negative.");
385 continue;
388 if (abs (dist) >= 2
389 && abs (dist) < *max_vf)
391 /* The dependence distance requires reduction of the maximal
392 vectorization factor. */
393 *max_vf = abs (dist);
394 if (dump_enabled_p ())
395 dump_printf_loc (MSG_NOTE, vect_location,
396 "adjusting maximal vectorization factor to %i",
397 *max_vf);
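/* For example, a dependence distance of 3 with an initial *MAX_VF of 8
   limits *MAX_VF to 3, so that elements which depend on each other never
   end up in the same vector iteration.  */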
400 if (abs (dist) >= *max_vf)
402 /* Dependence distance does not create dependence, as far as
403 vectorization is concerned, in this case. */
404 if (dump_enabled_p ())
405 dump_printf_loc (MSG_NOTE, vect_location,
406 "dependence distance >= VF.");
407 continue;
410 if (dump_enabled_p ())
412 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
413 "not vectorized, possible dependence "
414 "between data-refs ");
415 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
416 dump_printf (MSG_NOTE, " and ");
417 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
420 return true;
423 return false;
426 /* Function vect_analyze_data_ref_dependences.
428 Examine all the data references in the loop, and make sure there do not
429 exist any data dependences between them. Set *MAX_VF according to
430 the maximum vectorization factor the data dependences allow. */
432 bool
433 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
435 unsigned int i;
436 struct data_dependence_relation *ddr;
438 if (dump_enabled_p ())
439 dump_printf_loc (MSG_NOTE, vect_location,
440 "=== vect_analyze_data_ref_dependences ===");
442 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
443 &LOOP_VINFO_DDRS (loop_vinfo),
444 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
445 return false;
447 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
448 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
449 return false;
451 return true;
455 /* Function vect_slp_analyze_data_ref_dependence.
457 Return TRUE if there (might) exist a dependence between a memory-reference
458 DRA and a memory-reference DRB that would prohibit basic-block
459 vectorization.  Return FALSE if the references are independent for
460 that purpose. */
462 static bool
463 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
465 struct data_reference *dra = DDR_A (ddr);
466 struct data_reference *drb = DDR_B (ddr);
468 /* We need to check dependences of statements marked as unvectorizable
469 as well; they can still prohibit vectorization. */
471 /* Independent data accesses. */
472 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
473 return false;
475 if (dra == drb)
476 return false;
478 /* Read-read is OK. */
479 if (DR_IS_READ (dra) && DR_IS_READ (drb))
480 return false;
482 /* If dra and drb are part of the same interleaving chain consider
483 them independent. */
484 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
485 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
486 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
487 return false;
489 /* Unknown data dependence. */
490 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
492 gimple earlier_stmt;
494 if (dump_enabled_p ())
496 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
497 "can't determine dependence between ");
498 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
499 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
500 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
503 /* We do not vectorize basic blocks with write-write dependencies. */
504 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
505 return true;
507 /* Check that it's not a load-after-store dependence. */
508 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
509 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
510 return true;
512 return false;
515 if (dump_enabled_p ())
517 dump_printf_loc (MSG_NOTE, vect_location,
518 "determined dependence between ");
519 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
520 dump_printf (MSG_NOTE, " and ");
521 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
524 /* Do not vectorize basic blocks with write-write dependences. */
525 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
526 return true;
528 /* Check dependence between DRA and DRB for basic block vectorization.
529 If the accesses share the same bases and offsets, we can compare their
530 initial constant offsets to decide whether they differ or not.  In case of
531 a read-write dependence we check that the load is before the store to
532 ensure that vectorization will not change the order of the accesses. */
534 HOST_WIDE_INT type_size_a, type_size_b, init_a, init_b;
535 gimple earlier_stmt;
537 /* Check that the data-refs have the same bases and offsets.  If not, we can't
538 determine if they are dependent. */
539 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
540 || !dr_equal_offsets_p (dra, drb))
541 return true;
543 /* Check the types. */
544 type_size_a = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))));
545 type_size_b = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
547 if (type_size_a != type_size_b
548 || !types_compatible_p (TREE_TYPE (DR_REF (dra)),
549 TREE_TYPE (DR_REF (drb))))
550 return true;
552 init_a = TREE_INT_CST_LOW (DR_INIT (dra));
553 init_b = TREE_INT_CST_LOW (DR_INIT (drb));
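/* DR_INIT is the constant start offset in bytes; e.g. accesses a[2] and
   a[3] of a 4-byte element type would have inits of 8 and 12.  */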
555 /* Two different locations - no dependence. */
556 if (init_a != init_b)
557 return false;
559 /* We have a read-write dependence. Check that the load is before the store.
560 When we vectorize basic blocks, vector load can be only before
561 corresponding scalar load, and vector store can be only after its
562 corresponding scalar store.  So the order of the accesses is preserved in
563 case the load is before the store. */
564 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
565 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
566 return false;
568 return true;
572 /* Function vect_slp_analyze_data_ref_dependences.
574 Examine all the data references in the basic-block, and make sure there
575 do not exist any data dependences between them that would prohibit
576 vectorization. */
578 bool
579 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
581 struct data_dependence_relation *ddr;
582 unsigned int i;
584 if (dump_enabled_p ())
585 dump_printf_loc (MSG_NOTE, vect_location,
586 "=== vect_slp_analyze_data_ref_dependences ===");
588 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
589 &BB_VINFO_DDRS (bb_vinfo),
590 vNULL, true))
591 return false;
593 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
594 if (vect_slp_analyze_data_ref_dependence (ddr))
595 return false;
597 return true;
601 /* Function vect_compute_data_ref_alignment
603 Compute the misalignment of the data reference DR.
605 Output:
606 1. If during the misalignment computation it is found that the data reference
607 cannot be vectorized then false is returned.
608 2. DR_MISALIGNMENT (DR) is defined.
610 FOR NOW: No analysis is actually performed. Misalignment is calculated
611 only for trivial cases. TODO. */
613 static bool
614 vect_compute_data_ref_alignment (struct data_reference *dr)
616 gimple stmt = DR_STMT (dr);
617 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
618 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
619 struct loop *loop = NULL;
620 tree ref = DR_REF (dr);
621 tree vectype;
622 tree base, base_addr;
623 bool base_aligned;
624 tree misalign;
625 tree aligned_to, alignment;
627 if (dump_enabled_p ())
628 dump_printf_loc (MSG_NOTE, vect_location,
629 "vect_compute_data_ref_alignment:");
631 if (loop_vinfo)
632 loop = LOOP_VINFO_LOOP (loop_vinfo);
634 /* Initialize misalignment to unknown. */
635 SET_DR_MISALIGNMENT (dr, -1);
637 /* Strided loads perform only component accesses, misalignment information
638 is irrelevant for them. */
639 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
640 return true;
642 misalign = DR_INIT (dr);
643 aligned_to = DR_ALIGNED_TO (dr);
644 base_addr = DR_BASE_ADDRESS (dr);
645 vectype = STMT_VINFO_VECTYPE (stmt_info);
647 /* In case the dataref is in an inner-loop of the loop that is being
648 vectorized (LOOP), we use the base and misalignment information
649 relative to the outer-loop (LOOP). This is ok only if the misalignment
650 stays the same throughout the execution of the inner-loop, which is why
651 we have to check that the stride of the dataref in the inner-loop is a
652 multiple of the vector size. */
653 if (loop && nested_in_vect_loop_p (loop, stmt))
655 tree step = DR_STEP (dr);
656 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
658 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
660 if (dump_enabled_p ())
661 dump_printf_loc (MSG_NOTE, vect_location,
662 "inner step divides the vector-size.");
663 misalign = STMT_VINFO_DR_INIT (stmt_info);
664 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
665 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
667 else
669 if (dump_enabled_p ())
670 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
671 "inner step doesn't divide the vector-size.");
672 misalign = NULL_TREE;
676 /* Similarly, if we're doing basic-block vectorization, we can only use
677 base and misalignment information relative to an innermost loop if the
678 misalignment stays the same throughout the execution of the loop.
679 As above, this is the case if the stride of the dataref is a multiple
680 of the vector size. */
681 if (!loop)
683 tree step = DR_STEP (dr);
684 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
686 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
688 if (dump_enabled_p ())
689 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
690 "SLP: step doesn't divide the vector-size.");
691 misalign = NULL_TREE;
695 base = build_fold_indirect_ref (base_addr);
696 alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT);
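/* E.g. for a vector type with a TYPE_ALIGN of 128 bits, ALIGNMENT is the
   byte count 16 (assuming 8-bit units).  */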
698 if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
699 || !misalign)
701 if (dump_enabled_p ())
703 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
704 "Unknown alignment for access: ");
705 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
707 return true;
710 if ((DECL_P (base)
711 && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
712 alignment) >= 0)
713 || (TREE_CODE (base_addr) == SSA_NAME
714 && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
715 TREE_TYPE (base_addr)))),
716 alignment) >= 0)
717 || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
718 base_aligned = true;
719 else
720 base_aligned = false;
722 if (!base_aligned)
724 /* Do not change the alignment of global variables here if
725 flag_section_anchors is enabled as we already generated
726 RTL for other functions. Most global variables should
727 have been aligned during the IPA increase_alignment pass. */
728 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
729 || (TREE_STATIC (base) && flag_section_anchors))
731 if (dump_enabled_p ())
733 dump_printf_loc (MSG_NOTE, vect_location,
734 "can't force alignment of ref: ");
735 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
737 return true;
740 /* Force the alignment of the decl.
741 NOTE: This is the only change to the code we make during
742 the analysis phase, before deciding to vectorize the loop. */
743 if (dump_enabled_p ())
745 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
746 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
749 DECL_ALIGN (base) = TYPE_ALIGN (vectype);
750 DECL_USER_ALIGN (base) = 1;
753 /* At this point we assume that the base is aligned. */
754 gcc_assert (base_aligned
755 || (TREE_CODE (base) == VAR_DECL
756 && DECL_ALIGN (base) >= TYPE_ALIGN (vectype)));
758 /* If this is a backward running DR then the first access in the larger
759 vectype actually is N-1 elements before the address in the DR.
760 Adjust misalign accordingly. */
761 if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
763 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
764 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
765 otherwise we wouldn't be here. */
766 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
767 /* PLUS because DR_STEP was negative. */
768 misalign = size_binop (PLUS_EXPR, misalign, offset);
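/* For instance, for a V4SI access with DR_STEP == -4 the offset is
   3 * -4 == -12: the vector access really starts 12 bytes before the
   address recorded in the DR.  */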
771 /* Modulo alignment. */
772 misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);
774 if (!host_integerp (misalign, 1))
776 /* Negative or overflowed misalignment value. */
777 if (dump_enabled_p ())
778 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
779 "unexpected misalign value");
780 return false;
783 SET_DR_MISALIGNMENT (dr, TREE_INT_CST_LOW (misalign));
785 if (dump_enabled_p ())
787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
788 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
789 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
792 return true;
796 /* Function vect_compute_data_refs_alignment
798 Compute the misalignment of data references in the loop.
799 Return FALSE if a data reference is found that cannot be vectorized. */
801 static bool
802 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
803 bb_vec_info bb_vinfo)
805 vec<data_reference_p> datarefs;
806 struct data_reference *dr;
807 unsigned int i;
809 if (loop_vinfo)
810 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
811 else
812 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
814 FOR_EACH_VEC_ELT (datarefs, i, dr)
815 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
816 && !vect_compute_data_ref_alignment (dr))
818 if (bb_vinfo)
820 /* Mark unsupported statement as unvectorizable. */
821 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
822 continue;
824 else
825 return false;
828 return true;
832 /* Function vect_update_misalignment_for_peel
834 DR - the data reference whose misalignment is to be adjusted.
835 DR_PEEL - the data reference whose misalignment is being made
836 zero in the vector loop by the peel.
837 NPEEL - the number of iterations in the peel loop if the misalignment
838 of DR_PEEL is known at compile time. */
840 static void
841 vect_update_misalignment_for_peel (struct data_reference *dr,
842 struct data_reference *dr_peel, int npeel)
844 unsigned int i;
845 vec<dr_p> same_align_drs;
846 struct data_reference *current_dr;
847 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
848 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
849 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
850 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
852 /* For interleaved data accesses the step in the loop must be multiplied by
853 the size of the interleaving group. */
854 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
855 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
856 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
857 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
859 /* It can be assumed that the data refs with the same alignment as dr_peel
860 are aligned in the vector loop. */
861 same_align_drs
862 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
863 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
865 if (current_dr != dr)
866 continue;
867 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
868 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
869 SET_DR_MISALIGNMENT (dr, 0);
870 return;
873 if (known_alignment_for_access_p (dr)
874 && known_alignment_for_access_p (dr_peel))
876 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
877 int misal = DR_MISALIGNMENT (dr);
878 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
879 misal += negative ? -npeel * dr_size : npeel * dr_size;
880 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
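/* E.g. for a forward DR of 4-byte elements with NPEEL == 2, an old
   misalignment of 8 and a 16-byte vector alignment, the new misalignment
   is (8 + 2*4) & 15 == 0.  */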
881 SET_DR_MISALIGNMENT (dr, misal);
882 return;
885 if (dump_enabled_p ())
886 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.");
887 SET_DR_MISALIGNMENT (dr, -1);
891 /* Function vect_verify_datarefs_alignment
893 Return TRUE if all data references in the loop can be
894 handled with respect to alignment. */
896 bool
897 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
899 vec<data_reference_p> datarefs;
900 struct data_reference *dr;
901 enum dr_alignment_support supportable_dr_alignment;
902 unsigned int i;
904 if (loop_vinfo)
905 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
906 else
907 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
909 FOR_EACH_VEC_ELT (datarefs, i, dr)
911 gimple stmt = DR_STMT (dr);
912 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
914 if (!STMT_VINFO_RELEVANT_P (stmt_info))
915 continue;
917 /* For interleaving, only the alignment of the first access matters.
918 Skip statements marked as not vectorizable. */
919 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
920 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
921 || !STMT_VINFO_VECTORIZABLE (stmt_info))
922 continue;
924 /* Strided loads perform only component accesses, alignment is
925 irrelevant for them. */
926 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
927 continue;
929 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
930 if (!supportable_dr_alignment)
932 if (dump_enabled_p ())
934 if (DR_IS_READ (dr))
935 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
936 "not vectorized: unsupported unaligned load.");
937 else
938 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
939 "not vectorized: unsupported unaligned "
940 "store.");
942 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
943 DR_REF (dr));
945 return false;
947 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
948 dump_printf_loc (MSG_NOTE, vect_location,
949 "Vectorizing an unaligned access.");
951 return true;
954 /* Given a memory reference EXP return whether its alignment is less
955 than its size. */
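/* Note that both TYPE_SIZE and the value returned by get_object_alignment
   are measured in bits.  */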
957 static bool
958 not_size_aligned (tree exp)
960 if (!host_integerp (TYPE_SIZE (TREE_TYPE (exp)), 1))
961 return true;
963 return (TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (exp)))
964 > get_object_alignment (exp));
967 /* Function vector_alignment_reachable_p
969 Return true if vector alignment for DR is reachable by peeling
970 a few loop iterations. Return false otherwise. */
972 static bool
973 vector_alignment_reachable_p (struct data_reference *dr)
975 gimple stmt = DR_STMT (dr);
976 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
977 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
979 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
981 /* For interleaved access we peel only if the number of iterations in
982 the prolog loop (VF - misalignment) is a multiple of the
983 number of interleaved accesses. */
984 int elem_size, mis_in_elements;
985 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
987 /* FORNOW: handle only known alignment. */
988 if (!known_alignment_for_access_p (dr))
989 return false;
991 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
992 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
994 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
995 return false;
998 /* If misalignment is known at compile time then allow peeling
999 only if natural alignment is reachable through peeling. */
1000 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1002 HOST_WIDE_INT elmsize =
1003 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1004 if (dump_enabled_p ())
1006 dump_printf_loc (MSG_NOTE, vect_location,
1007 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1008 dump_printf (MSG_NOTE,
1009 ". misalignment = %d. ", DR_MISALIGNMENT (dr));
1011 if (DR_MISALIGNMENT (dr) % elmsize)
1013 if (dump_enabled_p ())
1014 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1015 "data size does not divide the misalignment.\n");
1016 return false;
1020 if (!known_alignment_for_access_p (dr))
1022 tree type = TREE_TYPE (DR_REF (dr));
1023 bool is_packed = not_size_aligned (DR_REF (dr));
1024 if (dump_enabled_p ())
1025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1026 "Unknown misalignment, is_packed = %d",is_packed);
1027 if ((TYPE_USER_ALIGN (type) && !is_packed)
1028 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1029 return true;
1030 else
1031 return false;
1034 return true;
1038 /* Calculate the cost of the memory access represented by DR. */
1040 static void
1041 vect_get_data_access_cost (struct data_reference *dr,
1042 unsigned int *inside_cost,
1043 unsigned int *outside_cost,
1044 stmt_vector_for_cost *body_cost_vec)
1046 gimple stmt = DR_STMT (dr);
1047 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1048 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1049 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1050 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1051 int ncopies = vf / nunits;
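/* E.g. with a vectorization factor of 8 and 4 elements per vector, each
   scalar access is covered by 2 vector accesses.  */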
1053 if (DR_IS_READ (dr))
1054 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1055 NULL, body_cost_vec, false);
1056 else
1057 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1059 if (dump_enabled_p ())
1060 dump_printf_loc (MSG_NOTE, vect_location,
1061 "vect_get_data_access_cost: inside_cost = %d, "
1062 "outside_cost = %d.", *inside_cost, *outside_cost);
1066 /* Insert DR into peeling hash table with NPEEL as key. */
1068 static void
1069 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1070 int npeel)
1072 struct _vect_peel_info elem, *slot;
1073 _vect_peel_info **new_slot;
1074 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1076 elem.npeel = npeel;
1077 slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find (&elem);
1078 if (slot)
1079 slot->count++;
1080 else
1082 slot = XNEW (struct _vect_peel_info);
1083 slot->npeel = npeel;
1084 slot->dr = dr;
1085 slot->count = 1;
1086 new_slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find_slot (slot, INSERT);
1087 *new_slot = slot;
1090 if (!supportable_dr_alignment && !flag_vect_cost_model)
1091 slot->count += VECT_MAX_COST;
1095 /* Traverse the peeling hash table to find the peeling option that aligns
1096 the maximum number of data accesses. */
1098 int
1099 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1100 _vect_peel_extended_info *max)
1102 vect_peel_info elem = *slot;
1104 if (elem->count > max->peel_info.count
1105 || (elem->count == max->peel_info.count
1106 && max->peel_info.npeel > elem->npeel))
1108 max->peel_info.npeel = elem->npeel;
1109 max->peel_info.count = elem->count;
1110 max->peel_info.dr = elem->dr;
1113 return 1;
1117 /* Traverse the peeling hash table and calculate the cost for each peeling
1118 option.  Find the one with the lowest cost. */
1120 int
1121 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1122 _vect_peel_extended_info *min)
1124 vect_peel_info elem = *slot;
1125 int save_misalignment, dummy;
1126 unsigned int inside_cost = 0, outside_cost = 0, i;
1127 gimple stmt = DR_STMT (elem->dr);
1128 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1129 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1130 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1131 struct data_reference *dr;
1132 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1133 int single_iter_cost;
1135 prologue_cost_vec.create (2);
1136 body_cost_vec.create (2);
1137 epilogue_cost_vec.create (2);
1139 FOR_EACH_VEC_ELT (datarefs, i, dr)
1141 stmt = DR_STMT (dr);
1142 stmt_info = vinfo_for_stmt (stmt);
1143 /* For interleaving, only the alignment of the first access
1144 matters. */
1145 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1146 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1147 continue;
1149 save_misalignment = DR_MISALIGNMENT (dr);
1150 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1151 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1152 &body_cost_vec);
1153 SET_DR_MISALIGNMENT (dr, save_misalignment);
1156 single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
1157 outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel,
1158 &dummy, single_iter_cost,
1159 &prologue_cost_vec,
1160 &epilogue_cost_vec);
1162 /* Prologue and epilogue costs are added to the target model later.
1163 These costs depend only on the scalar iteration cost, the
1164 number of peeling iterations finally chosen, and the number of
1165 misaligned statements. So discard the information found here. */
1166 prologue_cost_vec.release ();
1167 epilogue_cost_vec.release ();
1169 if (inside_cost < min->inside_cost
1170 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1172 min->inside_cost = inside_cost;
1173 min->outside_cost = outside_cost;
1174 min->body_cost_vec.release ();
1175 min->body_cost_vec = body_cost_vec;
1176 min->peel_info.dr = elem->dr;
1177 min->peel_info.npeel = elem->npeel;
1179 else
1180 body_cost_vec.release ();
1182 return 1;
1186 /* Choose the best peeling option by traversing the peeling hash table and
1187 either choosing the option with the lowest cost (if the cost model is
1188 enabled) or the one that aligns as many accesses as possible. */
1190 static struct data_reference *
1191 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1192 unsigned int *npeel,
1193 stmt_vector_for_cost *body_cost_vec)
1195 struct _vect_peel_extended_info res;
1197 res.peel_info.dr = NULL;
1198 res.body_cost_vec = stmt_vector_for_cost();
1200 if (flag_vect_cost_model)
1202 res.inside_cost = INT_MAX;
1203 res.outside_cost = INT_MAX;
1204 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1205 .traverse <_vect_peel_extended_info *,
1206 vect_peeling_hash_get_lowest_cost> (&res);
1208 else
1210 res.peel_info.count = 0;
1211 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1212 .traverse <_vect_peel_extended_info *,
1213 vect_peeling_hash_get_most_frequent> (&res);
1216 *npeel = res.peel_info.npeel;
1217 *body_cost_vec = res.body_cost_vec;
1218 return res.peel_info.dr;
1222 /* Function vect_enhance_data_refs_alignment
1224 This pass will use loop versioning and loop peeling in order to enhance
1225 the alignment of data references in the loop.
1227 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1228 original loop is to be vectorized. Any other loops that are created by
1229 the transformations performed in this pass - are not supposed to be
1230 vectorized. This restriction will be relaxed.
1232 This pass will require a cost model to guide it whether to apply peeling
1233 or versioning or a combination of the two. For example, the scheme that
1234 Intel uses when given a loop with several memory accesses, is as follows:
1235 choose one memory access ('p') whose alignment you want to force by doing
1236 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1237 other accesses are not necessarily aligned, or (2) use loop versioning to
1238 generate one loop in which all accesses are aligned, and another loop in
1239 which only 'p' is necessarily aligned.
1241 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1242 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1243 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1245 Devising a cost model is the most critical aspect of this work. It will
1246 guide us on which access to peel for, whether to use loop versioning, how
1247 many versions to create, etc. The cost model will probably consist of
1248 generic considerations as well as target specific considerations (on
1249 powerpc for example, misaligned stores are more painful than misaligned
1250 loads).
1252 Here are the general steps involved in alignment enhancements:
1254 -- original loop, before alignment analysis:
1255 for (i=0; i<N; i++){
1256 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1257 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1260 -- After vect_compute_data_refs_alignment:
1261 for (i=0; i<N; i++){
1262 x = q[i]; # DR_MISALIGNMENT(q) = 3
1263 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1266 -- Possibility 1: we do loop versioning:
1267 if (p is aligned) {
1268 for (i=0; i<N; i++){ # loop 1A
1269 x = q[i]; # DR_MISALIGNMENT(q) = 3
1270 p[i] = y; # DR_MISALIGNMENT(p) = 0
1273 else {
1274 for (i=0; i<N; i++){ # loop 1B
1275 x = q[i]; # DR_MISALIGNMENT(q) = 3
1276 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1280 -- Possibility 2: we do loop peeling:
1281 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1282 x = q[i];
1283 p[i] = y;
1285 for (i = 3; i < N; i++){ # loop 2A
1286 x = q[i]; # DR_MISALIGNMENT(q) = 0
1287 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1290 -- Possibility 3: combination of loop peeling and versioning:
1291 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1292 x = q[i];
1293 p[i] = y;
1295 if (p is aligned) {
1296 for (i = 3; i<N; i++){ # loop 3A
1297 x = q[i]; # DR_MISALIGNMENT(q) = 0
1298 p[i] = y; # DR_MISALIGNMENT(p) = 0
1301 else {
1302 for (i = 3; i<N; i++){ # loop 3B
1303 x = q[i]; # DR_MISALIGNMENT(q) = 0
1304 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1308 These loops are later passed to loop_transform to be vectorized. The
1309 vectorizer will use the alignment information to guide the transformation
1310 (whether to generate regular loads/stores, or with special handling for
1311 misalignment). */
1313 bool
1314 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1316 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1317 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1318 enum dr_alignment_support supportable_dr_alignment;
1319 struct data_reference *dr0 = NULL, *first_store = NULL;
1320 struct data_reference *dr;
1321 unsigned int i, j;
1322 bool do_peeling = false;
1323 bool do_versioning = false;
1324 bool stat;
1325 gimple stmt;
1326 stmt_vec_info stmt_info;
1327 unsigned int npeel = 0;
1328 bool all_misalignments_unknown = true;
1329 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1330 unsigned possible_npeel_number = 1;
1331 tree vectype;
1332 unsigned int nelements, mis, same_align_drs_max = 0;
1333 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost();
1335 if (dump_enabled_p ())
1336 dump_printf_loc (MSG_NOTE, vect_location,
1337 "=== vect_enhance_data_refs_alignment ===");
1339 /* While cost model enhancements are expected in the future, the high level
1340 view of the code at this time is as follows:
1342 A) If there is a misaligned access then see if peeling to align
1343 this access can make all data references satisfy
1344 vect_supportable_dr_alignment. If so, update data structures
1345 as needed and return true.
1347 B) If peeling wasn't possible and there is a data reference with an
1348 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1349 then see if loop versioning checks can be used to make all data
1350 references satisfy vect_supportable_dr_alignment. If so, update
1351 data structures as needed and return true.
1353 C) If neither peeling nor versioning were successful then return false if
1354 any data reference does not satisfy vect_supportable_dr_alignment.
1356 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1358 Note, Possibility 3 above (which is peeling and versioning together) is not
1359 being done at this time. */
1361 /* (1) Peeling to force alignment. */
1363 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1364 Considerations:
1365 + How many accesses will become aligned due to the peeling
1366 - How many accesses will become unaligned due to the peeling,
1367 and the cost of misaligned accesses.
1368 - The cost of peeling (the extra runtime checks, the increase
1369 in code size). */
1371 FOR_EACH_VEC_ELT (datarefs, i, dr)
1373 stmt = DR_STMT (dr);
1374 stmt_info = vinfo_for_stmt (stmt);
1376 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1377 continue;
1379 /* For interleaving, only the alignment of the first access
1380 matters. */
1381 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1382 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1383 continue;
1385 /* For invariant accesses there is nothing to enhance. */
1386 if (integer_zerop (DR_STEP (dr)))
1387 continue;
1389 /* Strided loads perform only component accesses, alignment is
1390 irrelevant for them. */
1391 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1392 continue;
1394 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1395 do_peeling = vector_alignment_reachable_p (dr);
1396 if (do_peeling)
1398 if (known_alignment_for_access_p (dr))
1400 unsigned int npeel_tmp;
1401 bool negative = tree_int_cst_compare (DR_STEP (dr),
1402 size_zero_node) < 0;
1404 /* Save info about DR in the hash table. */
1405 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
1406 LOOP_VINFO_PEELING_HTAB (loop_vinfo).create (1);
1408 vectype = STMT_VINFO_VECTYPE (stmt_info);
1409 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1410 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1411 TREE_TYPE (DR_REF (dr))));
1412 npeel_tmp = (negative
1413 ? (mis - nelements) : (nelements - mis))
1414 & (nelements - 1);
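/* For example, with NELEMENTS == 4 and a misalignment of 3 elements in a
   forward (non-negative step) access, NPEEL_TMP is (4 - 3) & 3 == 1.  */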
1416 /* For multiple types, it is possible that the bigger type access
1417 will have more than one peeling option.  E.g., a loop with two
1418 types: one of size (vector size / 4), and the other one of
1419 size (vector size / 8).  The vectorization factor will be 8.  If both
1420 accesses are misaligned by 3, the first one needs one scalar
1421 iteration to be aligned, and the second one needs 5.  But the
1422 first one will also be aligned by peeling 5 scalar
1423 iterations, and in that case both accesses will be aligned.
1424 Hence, except for the immediate peeling amount, we also want
1425 to try to add a full vector size, while we don't exceed the
1426 vectorization factor.
1427 We do this automatically for the cost model, since we calculate the cost
1428 for every peeling option. */
1429 if (!flag_vect_cost_model)
1430 possible_npeel_number = vf / nelements;
1432 /* Handle the aligned case. We may decide to align some other
1433 access, making DR unaligned. */
1434 if (DR_MISALIGNMENT (dr) == 0)
1436 npeel_tmp = 0;
1437 if (!flag_vect_cost_model)
1438 possible_npeel_number++;
1441 for (j = 0; j < possible_npeel_number; j++)
1443 gcc_assert (npeel_tmp <= vf);
1444 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1445 npeel_tmp += nelements;
1448 all_misalignments_unknown = false;
1449 /* Data-ref that was chosen for the case that all the
1450 misalignments are unknown is not relevant anymore, since we
1451 have a data-ref with known alignment. */
1452 dr0 = NULL;
1454 else
1456 /* If we don't know any misalignment values, we prefer
1457 peeling for the data-ref that has the maximum number of data-refs
1458 with the same alignment, unless the target prefers to align
1459 stores over loads. */
1460 if (all_misalignments_unknown)
1462 unsigned same_align_drs
1463 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1464 if (!dr0
1465 || same_align_drs_max < same_align_drs)
1467 same_align_drs_max = same_align_drs;
1468 dr0 = dr;
1470 /* For data-refs with the same number of related
1471 accesses prefer the one where the misalign
1472 computation will be invariant in the outermost loop. */
1473 else if (same_align_drs_max == same_align_drs)
1475 struct loop *ivloop0, *ivloop;
1476 ivloop0 = outermost_invariant_loop_for_expr
1477 (loop, DR_BASE_ADDRESS (dr0));
1478 ivloop = outermost_invariant_loop_for_expr
1479 (loop, DR_BASE_ADDRESS (dr));
1480 if ((ivloop && !ivloop0)
1481 || (ivloop && ivloop0
1482 && flow_loop_nested_p (ivloop, ivloop0)))
1483 dr0 = dr;
1486 if (!first_store && DR_IS_WRITE (dr))
1487 first_store = dr;
1490 /* If there are both known and unknown misaligned accesses in the
1491 loop, we choose peeling amount according to the known
1492 accesses. */
1493 if (!supportable_dr_alignment)
1495 dr0 = dr;
1496 if (!first_store && DR_IS_WRITE (dr))
1497 first_store = dr;
1501 else
1503 if (!aligned_access_p (dr))
1505 if (dump_enabled_p ())
1506 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1507 "vector alignment may not be reachable");
1508 break;
1513 /* Check if we can possibly peel the loop. */
1514 if (!vect_can_advance_ivs_p (loop_vinfo)
1515 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1516 do_peeling = false;
1518 if (do_peeling && all_misalignments_unknown
1519 && vect_supportable_dr_alignment (dr0, false))
1522 /* Check if the target prefers to align stores over loads, i.e., if
1523 misaligned stores are more expensive than misaligned loads (taking
1524 drs with the same alignment into account). */
1525 if (first_store && DR_IS_READ (dr0))
1527 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1528 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1529 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1530 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1531 stmt_vector_for_cost dummy;
1532 dummy.create (2);
1534 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1535 &dummy);
1536 vect_get_data_access_cost (first_store, &store_inside_cost,
1537 &store_outside_cost, &dummy);
1539 dummy.release ();
1541 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1542 aligning the load DR0). */
1543 load_inside_penalty = store_inside_cost;
1544 load_outside_penalty = store_outside_cost;
1545 for (i = 0;
1546 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1547 DR_STMT (first_store))).iterate (i, &dr);
1548 i++)
1549 if (DR_IS_READ (dr))
1551 load_inside_penalty += load_inside_cost;
1552 load_outside_penalty += load_outside_cost;
1554 else
1556 load_inside_penalty += store_inside_cost;
1557 load_outside_penalty += store_outside_cost;
1560 /* Calculate the penalty for leaving DR0 unaligned (by
1561 aligning the FIRST_STORE). */
1562 store_inside_penalty = load_inside_cost;
1563 store_outside_penalty = load_outside_cost;
1564 for (i = 0;
1565 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1566 DR_STMT (dr0))).iterate (i, &dr);
1567 i++)
1568 if (DR_IS_READ (dr))
1570 store_inside_penalty += load_inside_cost;
1571 store_outside_penalty += load_outside_cost;
1573 else
1575 store_inside_penalty += store_inside_cost;
1576 store_outside_penalty += store_outside_cost;
1579 if (load_inside_penalty > store_inside_penalty
1580 || (load_inside_penalty == store_inside_penalty
1581 && load_outside_penalty > store_outside_penalty))
1582 dr0 = first_store;
1585 /* In case there are only loads with different unknown misalignments, use
1586 peeling only if it may help to align other accesses in the loop. */
1587 if (!first_store
1588 && !STMT_VINFO_SAME_ALIGN_REFS (
1589 vinfo_for_stmt (DR_STMT (dr0))).length ()
1590 && vect_supportable_dr_alignment (dr0, false)
1591 != dr_unaligned_supported)
1592 do_peeling = false;
1595 if (do_peeling && !dr0)
1597 /* Peeling is possible, but there is no data access that is not supported
1598 unless aligned. So we try to choose the best possible peeling. */
1600 /* We should get here only if there are drs with known misalignment. */
1601 gcc_assert (!all_misalignments_unknown);
1603 /* Choose the best peeling from the hash table. */
1604 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1605 &body_cost_vec);
1606 if (!dr0 || !npeel)
1607 do_peeling = false;
1610 if (do_peeling)
1612 stmt = DR_STMT (dr0);
1613 stmt_info = vinfo_for_stmt (stmt);
1614 vectype = STMT_VINFO_VECTYPE (stmt_info);
1615 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1617 if (known_alignment_for_access_p (dr0))
1619 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1620 size_zero_node) < 0;
1621 if (!npeel)
1623 /* Since it's known at compile time, compute the number of
1624 iterations in the peeled loop (the peeling factor) for use in
1625 updating DR_MISALIGNMENT values. The peeling factor is the
1626 vectorization factor minus the misalignment as an element
1627 count. */
1628 mis = DR_MISALIGNMENT (dr0);
1629 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1630 npeel = ((negative ? mis - nelements : nelements - mis)
1631 & (nelements - 1));
1634 /* For interleaved data access every iteration accesses all the
1635 members of the group, therefore we divide the number of iterations
1636 by the group size. */
1637 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1638 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1639 npeel /= GROUP_SIZE (stmt_info);
1641 if (dump_enabled_p ())
1642 dump_printf_loc (MSG_NOTE, vect_location,
1643 "Try peeling by %d", npeel);
1646 /* Ensure that all data refs can be vectorized after the peel. */
1647 FOR_EACH_VEC_ELT (datarefs, i, dr)
1649 int save_misalignment;
1651 if (dr == dr0)
1652 continue;
1654 stmt = DR_STMT (dr);
1655 stmt_info = vinfo_for_stmt (stmt);
1656 /* For interleaving, only the alignment of the first access
1657 matters. */
1658 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1659 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1660 continue;
1662 /* Strided loads perform only component accesses, alignment is
1663 irrelevant for them. */
1664 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1665 continue;
1667 save_misalignment = DR_MISALIGNMENT (dr);
1668 vect_update_misalignment_for_peel (dr, dr0, npeel);
1669 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1670 SET_DR_MISALIGNMENT (dr, save_misalignment);
1672 if (!supportable_dr_alignment)
1674 do_peeling = false;
1675 break;
1679 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1681 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1682 if (!stat)
1683 do_peeling = false;
1684 else
1686 body_cost_vec.release ();
1687 return stat;
1691 if (do_peeling)
1693 stmt_info_for_cost *si;
1694 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
1696 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1697 If the misalignment of DR_i is identical to that of dr0 then set
1698 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1699 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1700 by the peeling factor times the element size of DR_i (MOD the
1701 vectorization factor times the size). Otherwise, the
1702 misalignment of DR_i must be set to unknown. */
1703 FOR_EACH_VEC_ELT (datarefs, i, dr)
1704 if (dr != dr0)
1705 vect_update_misalignment_for_peel (dr, dr0, npeel);
1707 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1708 if (npeel)
1709 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1710 else
1711 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = DR_MISALIGNMENT (dr0);
1712 SET_DR_MISALIGNMENT (dr0, 0);
1713 if (dump_enabled_p ())
1715 dump_printf_loc (MSG_NOTE, vect_location,
1716 "Alignment of access forced using peeling.");
1717 dump_printf_loc (MSG_NOTE, vect_location,
1718 "Peeling for alignment will be applied.");
1720 /* We've delayed passing the inside-loop peeling costs to the
1721 target cost model until we were sure peeling would happen.
1722 Do so now. */
1723 if (body_cost_vec.exists ())
1725 FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1727 struct _stmt_vec_info *stmt_info
1728 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1729 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1730 si->misalign, vect_body);
1732 body_cost_vec.release ();
1735 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1736 gcc_assert (stat);
1737 return stat;
1741 body_cost_vec.release ();
1743 /* (2) Versioning to force alignment. */
1745 /* Try versioning if:
1746 1) flag_tree_vect_loop_version is TRUE
1747 2) optimize loop for speed
1748 3) there is at least one unsupported misaligned data ref with an unknown
1749 misalignment, and
1750 4) all misaligned data refs with a known misalignment are supported, and
1751 5) the number of runtime alignment checks is within reason. */
1753 do_versioning =
1754 flag_tree_vect_loop_version
1755 && optimize_loop_nest_for_speed_p (loop)
1756 && (!loop->inner); /* FORNOW */
1758 if (do_versioning)
1760 FOR_EACH_VEC_ELT (datarefs, i, dr)
1762 stmt = DR_STMT (dr);
1763 stmt_info = vinfo_for_stmt (stmt);
1765 /* For interleaving, only the alignment of the first access
1766 matters. */
1767 if (aligned_access_p (dr)
1768 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1769 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1770 continue;
1772 /* Strided loads perform only component accesses, alignment is
1773 irrelevant for them. */
1774 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1775 continue;
1777 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1779 if (!supportable_dr_alignment)
1781 gimple stmt;
1782 int mask;
1783 tree vectype;
1785 if (known_alignment_for_access_p (dr)
1786 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1787 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1789 do_versioning = false;
1790 break;
1793 stmt = DR_STMT (dr);
1794 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1795 gcc_assert (vectype);
1797 /* The rightmost bits of an aligned address must be zeros.
1798 Construct the mask needed for this test. For example,
1799 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1800 mask must be 15 = 0xf. */
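	  /* Illustration (not part of this function): the versioning code
	     later emits a runtime test roughly of the form
	     ((addr_1 | addr_2 | ...) & mask) == 0 over all recorded
	     potentially misaligned addresses.  E.g. with V4SI (mask 0xf),
	     an address of 0x1008 fails the test because 0x1008 & 0xf == 0x8,
	     so the unvectorized version of the loop is taken.  */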
1801 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1803 /* FORNOW: use the same mask to test all potentially unaligned
1804 references in the loop. The vectorizer currently supports
1805 a single vector size, see the reference to
1806 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1807 vectorization factor is computed. */
1808 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1809 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1810 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1811 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1812 DR_STMT (dr));
1816 /* Versioning requires at least one misaligned data reference. */
1817 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1818 do_versioning = false;
1819 else if (!do_versioning)
1820 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1823 if (do_versioning)
1825 vec<gimple> may_misalign_stmts
1826 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1827 gimple stmt;
1829 /* It can now be assumed that the data references in the statements
1830 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1831 of the loop being vectorized. */
1832 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1834 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1835 dr = STMT_VINFO_DATA_REF (stmt_info);
1836 SET_DR_MISALIGNMENT (dr, 0);
1837 if (dump_enabled_p ())
1838 dump_printf_loc (MSG_NOTE, vect_location,
1839 "Alignment of access forced using versioning.");
1842 if (dump_enabled_p ())
1843 dump_printf_loc (MSG_NOTE, vect_location,
1844 "Versioning for alignment will be applied.");
1846 /* Peeling and versioning can't be done together at this time. */
1847 gcc_assert (! (do_peeling && do_versioning));
1849 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1850 gcc_assert (stat);
1851 return stat;
1854 /* This point is reached if neither peeling nor versioning is being done. */
1855 gcc_assert (! (do_peeling || do_versioning));
1857 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1858 return stat;
1862 /* Function vect_find_same_alignment_drs.
1864 Update group and alignment relations according to the chosen
1865 vectorization factor. */
1867 static void
1868 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1869 loop_vec_info loop_vinfo)
1871 unsigned int i;
1872 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1873 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1874 struct data_reference *dra = DDR_A (ddr);
1875 struct data_reference *drb = DDR_B (ddr);
1876 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1877 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1878 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1879 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1880 lambda_vector dist_v;
1881 unsigned int loop_depth;
1883 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1884 return;
1886 if (dra == drb)
1887 return;
1889 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1890 return;
1892 /* Loop-based vectorization and known data dependence. */
1893 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1894 return;
1896 /* Data-dependence analysis reports a distance vector of zero
1897 for data-references that overlap only in the first iteration
1898 but have different sign step (see PR45764).
1899 So as a sanity check require equal DR_STEP. */
1900 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1901 return;
1903 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1904 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1906 int dist = dist_v[loop_depth];
1908 if (dump_enabled_p ())
1909 dump_printf_loc (MSG_NOTE, vect_location,
1910 "dependence distance = %d.", dist);
1912 /* Same loop iteration. */
1913 if (dist == 0
1914 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1916 /* Two references with distance zero have the same alignment. */
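		/* For instance (illustrative): with a vectorization factor of
		   4 and 4-byte elements, a[i] and a[i+4] have dependence
		   distance 4; 4 % 4 == 0 and the sizes match, so both
		   references are recorded as having the same alignment.  */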
1917 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1918 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1919 if (dump_enabled_p ())
1921 dump_printf_loc (MSG_NOTE, vect_location,
1922 "accesses have the same alignment.");
1923 dump_printf (MSG_NOTE,
1924 "dependence distance modulo vf == 0 between ");
1925 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1926 dump_printf (MSG_NOTE, " and ");
1927 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1934 /* Function vect_analyze_data_refs_alignment
1936 Analyze the alignment of the data-references in the loop.
1937 Return FALSE if a data reference is found that cannot be vectorized. */
1939 bool
1940 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1941 bb_vec_info bb_vinfo)
1943 if (dump_enabled_p ())
1944 dump_printf_loc (MSG_NOTE, vect_location,
1945 "=== vect_analyze_data_refs_alignment ===");
1947 /* Mark groups of data references with same alignment using
1948 data dependence information. */
1949 if (loop_vinfo)
1951 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
1952 struct data_dependence_relation *ddr;
1953 unsigned int i;
1955 FOR_EACH_VEC_ELT (ddrs, i, ddr)
1956 vect_find_same_alignment_drs (ddr, loop_vinfo);
1959 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
1961 if (dump_enabled_p ())
1962 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1963 "not vectorized: can't calculate alignment "
1964 "for data ref.");
1965 return false;
1968 return true;
1972 /* Analyze groups of accesses: check that DR belongs to a group of
1973 accesses of legal size, step, etc. Detect gaps, single element
1974 interleaving, and other special cases. Set grouped access info.
1975 Collect groups of strided stores for further use in SLP analysis. */
1977 static bool
1978 vect_analyze_group_access (struct data_reference *dr)
1980 tree step = DR_STEP (dr);
1981 tree scalar_type = TREE_TYPE (DR_REF (dr));
1982 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
1983 gimple stmt = DR_STMT (dr);
1984 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1985 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1986 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
1987 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
1988 HOST_WIDE_INT groupsize, last_accessed_element = 1;
1989 bool slp_impossible = false;
1990 struct loop *loop = NULL;
1992 if (loop_vinfo)
1993 loop = LOOP_VINFO_LOOP (loop_vinfo);
1995 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
1996 size of the interleaving group (including gaps). */
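  /* For example (illustrative): a 4-byte int access with a DR_STEP of
     16 bytes gives groupsize = 16 / 4 = 4, i.e. the group spans the
     elements a[4*i] .. a[4*i+3], some of which may be gaps.  */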
1997 groupsize = absu_hwi (dr_step) / type_size;
1999 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2000 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2002 /* Check if this DR is a part of interleaving, and is a single
2003 element of the group that is accessed in the loop. */
2005 /* Gaps are supported only for loads. STEP must be a multiple of the type
2006 size. The size of the group must be a power of 2. */
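      /* E.g. (illustrative): a loop that only reads a[2*i] with 4-byte
	 ints has dr_step = 8 and groupsize = 2 (a power of 2), so it is
	 treated as single element interleaving with a gap and, as noted
	 below, requires a scalar epilogue loop when vectorized.  */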
2007 if (DR_IS_READ (dr)
2008 && (dr_step % type_size) == 0
2009 && groupsize > 0
2010 && exact_log2 (groupsize) != -1)
2012 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2013 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2014 if (dump_enabled_p ())
2016 dump_printf_loc (MSG_NOTE, vect_location,
2017 "Detected single element interleaving ");
2018 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2019 dump_printf (MSG_NOTE, " step ");
2020 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2023 if (loop_vinfo)
2025 if (dump_enabled_p ())
2026 dump_printf_loc (MSG_NOTE, vect_location,
2027 "Data access with gaps requires scalar "
2028 "epilogue loop");
2029 if (loop->inner)
2031 if (dump_enabled_p ())
2032 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2033 "Peeling for outer loop is not"
2034 " supported");
2035 return false;
2038 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2041 return true;
2044 if (dump_enabled_p ())
2046 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2047 "not consecutive access ");
2048 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2051 if (bb_vinfo)
2053 /* Mark the statement as unvectorizable. */
2054 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2055 return true;
2058 return false;
2061 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2063 /* First stmt in the interleaving chain. Check the chain. */
2064 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2065 struct data_reference *data_ref = dr;
2066 unsigned int count = 1;
2067 tree prev_init = DR_INIT (data_ref);
2068 gimple prev = stmt;
2069 HOST_WIDE_INT diff, gaps = 0;
2070 unsigned HOST_WIDE_INT count_in_bytes;
2072 while (next)
2074 /* Skip same data-refs. In case two or more stmts share a
2075 data-ref (supported only for loads), we vectorize only the first
2076 stmt, and the rest get their vectorized loads from the first
2077 one. */
2078 if (!tree_int_cst_compare (DR_INIT (data_ref),
2079 DR_INIT (STMT_VINFO_DATA_REF (
2080 vinfo_for_stmt (next)))))
2082 if (DR_IS_WRITE (data_ref))
2084 if (dump_enabled_p ())
2085 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2086 "Two store stmts share the same dr.");
2087 return false;
2090 /* For load use the same data-ref load. */
2091 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2093 prev = next;
2094 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2095 continue;
2098 prev = next;
2099 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2101 /* All group members have the same STEP by construction. */
2102 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2104 /* Check that the distance between two accesses is equal to the type
2105 size. Otherwise, we have gaps. */
2106 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2107 - TREE_INT_CST_LOW (prev_init)) / type_size;
2108 if (diff != 1)
2110 /* FORNOW: SLP of accesses with gaps is not supported. */
2111 slp_impossible = true;
2112 if (DR_IS_WRITE (data_ref))
2114 if (dump_enabled_p ())
2115 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2116 "interleaved store with gaps");
2117 return false;
2120 gaps += diff - 1;
2123 last_accessed_element += diff;
2125 /* Store the gap from the previous member of the group. If there is no
2126 gap in the access, GROUP_GAP is always 1. */
2127 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2129 prev_init = DR_INIT (data_ref);
2130 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2131 /* Count the number of data-refs in the chain. */
2132 count++;
2135 /* COUNT is the number of accesses found; we multiply it by the size of
2136 the type to get COUNT_IN_BYTES. */
2137 count_in_bytes = type_size * count;
2139 /* Check that the size of the interleaving (including gaps) is not
2140 greater than STEP. */
2141 if (dr_step != 0
2142 && absu_hwi (dr_step) < count_in_bytes + gaps * type_size)
2144 if (dump_enabled_p ())
2146 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2147 "interleaving size is greater than step for ");
2148 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dr));
2150 return false;
2153 /* Check that the size of the interleaving is equal to STEP for stores,
2154 i.e., that there are no gaps. */
2155 if (dr_step != 0
2156 && absu_hwi (dr_step) != count_in_bytes)
2158 if (DR_IS_READ (dr))
2160 slp_impossible = true;
2161 /* There is a gap after the last load in the group. This gap is a
2162 difference between the groupsize and the number of elements.
2163 When there is no gap, this difference should be 0. */
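	  /* E.g. (illustrative): for loads a[4*i], a[4*i+1] and a[4*i+2]
	     with a step of four elements, groupsize = 4 and count = 3, so
	     the gap recorded on the first statement is 1.  */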
2164 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
2166 else
2168 if (dump_enabled_p ())
2169 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2170 "interleaved store with gaps");
2171 return false;
2175 /* Check that STEP is a multiple of type size. */
2176 if (dr_step != 0
2177 && (dr_step % type_size) != 0)
2179 if (dump_enabled_p ())
2181 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2182 "step is not a multiple of type size: step ");
2183 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2184 dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2185 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2186 TYPE_SIZE_UNIT (scalar_type));
2188 return false;
2191 if (groupsize == 0)
2192 groupsize = count;
2194 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2195 if (dump_enabled_p ())
2196 dump_printf_loc (MSG_NOTE, vect_location,
2197 "Detected interleaving of size %d", (int)groupsize);
2199 /* SLP: create an SLP data structure for every interleaving group of
2200 stores for further analysis in vect_analyze_slp. */
2201 if (DR_IS_WRITE (dr) && !slp_impossible)
2203 if (loop_vinfo)
2204 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2205 if (bb_vinfo)
2206 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2209 /* There is a gap in the end of the group. */
2210 if (groupsize - last_accessed_element > 0 && loop_vinfo)
2212 if (dump_enabled_p ())
2213 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2214 "Data access with gaps requires scalar "
2215 "epilogue loop");
2216 if (loop->inner)
2218 if (dump_enabled_p ())
2219 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2220 "Peeling for outer loop is not supported");
2221 return false;
2224 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2228 return true;
2232 /* Analyze the access pattern of the data-reference DR.
2233 In case of non-consecutive accesses call vect_analyze_group_access() to
2234 analyze groups of accesses. */
2236 static bool
2237 vect_analyze_data_ref_access (struct data_reference *dr)
2239 tree step = DR_STEP (dr);
2240 tree scalar_type = TREE_TYPE (DR_REF (dr));
2241 gimple stmt = DR_STMT (dr);
2242 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2243 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2244 struct loop *loop = NULL;
2246 if (loop_vinfo)
2247 loop = LOOP_VINFO_LOOP (loop_vinfo);
2249 if (loop_vinfo && !step)
2251 if (dump_enabled_p ())
2252 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2253 "bad data-ref access in loop");
2254 return false;
2257 /* Allow invariant loads in loops. */
2258 if (loop_vinfo && integer_zerop (step))
2260 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2261 return DR_IS_READ (dr);
2264 if (loop && nested_in_vect_loop_p (loop, stmt))
2266 /* Interleaved accesses are not yet supported within outer-loop
2267 vectorization for references in the inner-loop. */
2268 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2270 /* For the rest of the analysis we use the outer-loop step. */
2271 step = STMT_VINFO_DR_STEP (stmt_info);
2272 if (integer_zerop (step))
2274 if (dump_enabled_p ())
2275 dump_printf_loc (MSG_NOTE, vect_location,
2276 "zero step in outer loop.");
2277 if (DR_IS_READ (dr))
2278 return true;
2279 else
2280 return false;
2284 /* Consecutive? */
2285 if (TREE_CODE (step) == INTEGER_CST)
2287 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2288 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2289 || (dr_step < 0
2290 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2292 /* Mark that it is not interleaving. */
2293 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2294 return true;
2298 if (loop && nested_in_vect_loop_p (loop, stmt))
2300 if (dump_enabled_p ())
2301 dump_printf_loc (MSG_NOTE, vect_location,
2302 "grouped access in outer loop.");
2303 return false;
2306 /* Assume this is a DR handled by the non-constant strided load case. */
2307 if (TREE_CODE (step) != INTEGER_CST)
2308 return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2310 /* Non-consecutive access - check whether it is part of an interleaving group. */
2311 return vect_analyze_group_access (dr);
2314 /* Comparator used to sort data-references DRA and DRB into chunks
2315 suitable for grouping. */
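/* A sketch of the resulting order (illustrative): for the accesses
   b[i], a[2*i+1], a[2*i] the comparator places the two a[] references
   next to each other (same base, offset, size and step, ordered by
   DR_INIT), so the chain-building loop in
   vect_analyze_data_ref_accesses only needs to inspect adjacent
   elements.  */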
2317 static int
2318 dr_group_sort_cmp (const void *dra_, const void *drb_)
2320 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2321 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2322 hashval_t h1, h2;
2323 int cmp;
2325 /* Stabilize sort. */
2326 if (dra == drb)
2327 return 0;
2329 /* Ordering of DRs according to base. */
2330 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2332 h1 = iterative_hash_expr (DR_BASE_ADDRESS (dra), 0);
2333 h2 = iterative_hash_expr (DR_BASE_ADDRESS (drb), 0);
2334 if (h1 != h2)
2335 return h1 < h2 ? -1 : 1;
2338 /* And according to DR_OFFSET. */
2339 if (!dr_equal_offsets_p (dra, drb))
2341 h1 = iterative_hash_expr (DR_OFFSET (dra), 0);
2342 h2 = iterative_hash_expr (DR_OFFSET (drb), 0);
2343 if (h1 != h2)
2344 return h1 < h2 ? -1 : 1;
2347 /* Put reads before writes. */
2348 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2349 return DR_IS_READ (dra) ? -1 : 1;
2351 /* Then sort after access size. */
2352 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2353 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2355 h1 = iterative_hash_expr (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))), 0);
2356 h2 = iterative_hash_expr (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0);
2357 if (h1 != h2)
2358 return h1 < h2 ? -1 : 1;
2361 /* And after step. */
2362 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2364 h1 = iterative_hash_expr (DR_STEP (dra), 0);
2365 h2 = iterative_hash_expr (DR_STEP (drb), 0);
2366 if (h1 != h2)
2367 return h1 < h2 ? -1 : 1;
2370 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2371 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2372 if (cmp == 0)
2373 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2374 return cmp;
2377 /* Function vect_analyze_data_ref_accesses.
2379 Analyze the access pattern of all the data references in the loop.
2381 FORNOW: the only access pattern that is considered vectorizable is a
2382 simple step 1 (consecutive) access.
2384 FORNOW: handle only arrays and pointer accesses. */
2386 bool
2387 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2389 unsigned int i;
2390 vec<data_reference_p> datarefs;
2391 struct data_reference *dr;
2393 if (dump_enabled_p ())
2394 dump_printf_loc (MSG_NOTE, vect_location,
2395 "=== vect_analyze_data_ref_accesses ===");
2397 if (loop_vinfo)
2398 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2399 else
2400 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2402 if (datarefs.is_empty ())
2403 return true;
2405 /* Sort the array of datarefs to make building the interleaving chains
2406 linear. */
2407 qsort (datarefs.address(), datarefs.length (),
2408 sizeof (data_reference_p), dr_group_sort_cmp);
2410 /* Build the interleaving chains. */
2411 for (i = 0; i < datarefs.length () - 1;)
2413 data_reference_p dra = datarefs[i];
2414 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2415 stmt_vec_info lastinfo = NULL;
2416 for (i = i + 1; i < datarefs.length (); ++i)
2418 data_reference_p drb = datarefs[i];
2419 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2421 /* ??? Imperfect sorting (non-compatible types, non-modulo
2422 accesses, same accesses) can lead to a group being artificially
2423 split here as we don't just skip over those. If it really
2424 matters we can push those to a worklist and re-iterate
2425 over them. Then we can just skip ahead to the next DR here. */
2427 /* Check that the data-refs have same first location (except init)
2428 and they are both either store or load (not load and store). */
2429 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2430 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2431 DR_BASE_ADDRESS (drb), 0)
2432 || !dr_equal_offsets_p (dra, drb))
2433 break;
2435 /* Check that the data-refs have the same constant size and step. */
2436 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2437 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2438 if (!host_integerp (sza, 1)
2439 || !host_integerp (szb, 1)
2440 || !tree_int_cst_equal (sza, szb)
2441 || !host_integerp (DR_STEP (dra), 0)
2442 || !host_integerp (DR_STEP (drb), 0)
2443 || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2444 break;
2446 /* Do not place the same access in the interleaving chain twice. */
2447 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2448 break;
2450 /* Check the types are compatible.
2451 ??? We don't distinguish this during sorting. */
2452 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2453 TREE_TYPE (DR_REF (drb))))
2454 break;
2456 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2457 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2458 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2459 gcc_assert (init_a < init_b);
2461 /* If init_b == init_a + the size of the type * k, we have an
2462 interleaving, and DRA is accessed before DRB. */
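	  /* E.g. (illustrative): with 4-byte ints, a[2*i] has DR_INIT 0 and
	     a[2*i+1] has DR_INIT 4; (4 - 0) % 4 == 0, so the two accesses
	     form an interleaving with DRA accessed first.  */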
2463 HOST_WIDE_INT type_size_a = TREE_INT_CST_LOW (sza);
2464 if ((init_b - init_a) % type_size_a != 0)
2465 break;
2467 /* The step (if not zero) is greater than the difference between
2468 data-refs' inits. This splits groups into suitable sizes. */
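	  /* E.g. (illustrative): with 4-byte ints, a[2*i] and a[2*i+2] have
	     DR_INITs 0 and 8 and a step of 8 bytes; a[2*i+2] is simply
	     a[2*i] of the next iteration, and since step <= init_b - init_a
	     the two accesses are not placed in the same group.  */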
2469 HOST_WIDE_INT step = TREE_INT_CST_LOW (DR_STEP (dra));
2470 if (step != 0 && step <= (init_b - init_a))
2471 break;
2473 if (dump_enabled_p ())
2475 dump_printf_loc (MSG_NOTE, vect_location,
2476 "Detected interleaving ");
2477 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2478 dump_printf (MSG_NOTE, " and ");
2479 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2482 /* Link the found element into the group list. */
2483 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2485 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2486 lastinfo = stmtinfo_a;
2488 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2489 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2490 lastinfo = stmtinfo_b;
2494 FOR_EACH_VEC_ELT (datarefs, i, dr)
2495 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2496 && !vect_analyze_data_ref_access (dr))
2498 if (dump_enabled_p ())
2499 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2500 "not vectorized: complicated access pattern.");
2502 if (bb_vinfo)
2504 /* Mark the statement as not vectorizable. */
2505 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2506 continue;
2508 else
2509 return false;
2512 return true;
2515 /* Function vect_prune_runtime_alias_test_list.
2517 Prune a list of ddrs to be tested at run-time by versioning for alias.
2518 Return FALSE if the resulting list of ddrs is longer than allowed by
2519 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2521 bool
2522 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2524 vec<ddr_p> ddrs =
2525 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2526 unsigned i, j;
2528 if (dump_enabled_p ())
2529 dump_printf_loc (MSG_NOTE, vect_location,
2530 "=== vect_prune_runtime_alias_test_list ===");
2532 for (i = 0; i < ddrs.length (); )
2534 bool found;
2535 ddr_p ddr_i;
2537 ddr_i = ddrs[i];
2538 found = false;
2540 for (j = 0; j < i; j++)
2542 ddr_p ddr_j = ddrs[j];
2544 if (vect_vfa_range_equal (ddr_i, ddr_j))
2546 if (dump_enabled_p ())
2548 dump_printf_loc (MSG_NOTE, vect_location,
2549 "found equal ranges ");
2550 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr_i)));
2551 dump_printf (MSG_NOTE, ", ");
2552 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr_i)));
2553 dump_printf (MSG_NOTE, " and ");
2554 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr_j)));
2555 dump_printf (MSG_NOTE, ", ");
2556 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr_j)));
2558 found = true;
2559 break;
2563 if (found)
2565 ddrs.ordered_remove (i);
2566 continue;
2568 i++;
2571 if (ddrs.length () >
2572 (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2574 if (dump_enabled_p ())
2576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2577 "disable versioning for alias - max number of "
2578 "generated checks exceeded.");
2581 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0);
2583 return false;
2586 return true;
2589 /* Check whether a non-affine read in stmt is suitable for gather load
2590 and if so, return a builtin decl for that operation. */
2592 tree
2593 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2594 tree *offp, int *scalep)
2596 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2597 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2598 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2599 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2600 tree offtype = NULL_TREE;
2601 tree decl, base, off;
2602 enum machine_mode pmode;
2603 int punsignedp, pvolatilep;
2605 /* The gather builtins need address of the form
2606 loop_invariant + vector * {1, 2, 4, 8}
2607 or
2608 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2609 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
2610 of loop invariants/SSA_NAMEs defined in the loop, with casts,
2611 multiplications and additions in it. To get a vector, we need
2612 a single SSA_NAME that will be defined in the loop and will
2613 contain everything that is not loop invariant and that can be
2614 vectorized. The following code attempts to find such a preexisting
2615 SSA_NAME OFF and put the loop invariants into a tree BASE
2616 that can be gimplified before the loop. */
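  /* An illustrative case (not from the source): for a read of a[b[i]]
     with 4-byte ints, BASE would end up as &a[0] (loop invariant), OFF
     as the SSA_NAME holding b[i] (possibly sign-extended), and SCALE
     as 4, matching the loop_invariant + vector * {1, 2, 4, 8} form
     described above.  */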
2617 base = get_inner_reference (DR_REF (dr), &pbitsize, &pbitpos, &off,
2618 &pmode, &punsignedp, &pvolatilep, false);
2619 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
2621 if (TREE_CODE (base) == MEM_REF)
2623 if (!integer_zerop (TREE_OPERAND (base, 1)))
2625 if (off == NULL_TREE)
2627 double_int moff = mem_ref_offset (base);
2628 off = double_int_to_tree (sizetype, moff);
2630 else
2631 off = size_binop (PLUS_EXPR, off,
2632 fold_convert (sizetype, TREE_OPERAND (base, 1)));
2634 base = TREE_OPERAND (base, 0);
2636 else
2637 base = build_fold_addr_expr (base);
2639 if (off == NULL_TREE)
2640 off = size_zero_node;
2642 /* If base is not loop invariant, then, if off is 0, we start with just
2643 the constant offset in the loop invariant BASE and continue with base
2644 as OFF; otherwise we give up.
2645 We could handle that case by gimplifying the addition of base + off
2646 into some SSA_NAME and use that as off, but for now punt. */
2647 if (!expr_invariant_in_loop_p (loop, base))
2649 if (!integer_zerop (off))
2650 return NULL_TREE;
2651 off = base;
2652 base = size_int (pbitpos / BITS_PER_UNIT);
2654 /* Otherwise put base + constant offset into the loop invariant BASE
2655 and continue with OFF. */
2656 else
2658 base = fold_convert (sizetype, base);
2659 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
2662 /* OFF at this point may be either a SSA_NAME or some tree expression
2663 from get_inner_reference. Try to peel off loop invariants from it
2664 into BASE as long as possible. */
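  /* E.g. (illustrative): if OFF is currently n + i with loop-invariant n
     and SCALE has already been peeled to 4, the PLUS_EXPR case below
     folds n * 4 into BASE and continues with OFF = i.  */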
2665 STRIP_NOPS (off);
2666 while (offtype == NULL_TREE)
2668 enum tree_code code;
2669 tree op0, op1, add = NULL_TREE;
2671 if (TREE_CODE (off) == SSA_NAME)
2673 gimple def_stmt = SSA_NAME_DEF_STMT (off);
2675 if (expr_invariant_in_loop_p (loop, off))
2676 return NULL_TREE;
2678 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
2679 break;
2681 op0 = gimple_assign_rhs1 (def_stmt);
2682 code = gimple_assign_rhs_code (def_stmt);
2683 op1 = gimple_assign_rhs2 (def_stmt);
2685 else
2687 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
2688 return NULL_TREE;
2689 code = TREE_CODE (off);
2690 extract_ops_from_tree (off, &code, &op0, &op1);
2692 switch (code)
2694 case POINTER_PLUS_EXPR:
2695 case PLUS_EXPR:
2696 if (expr_invariant_in_loop_p (loop, op0))
2698 add = op0;
2699 off = op1;
2700 do_add:
2701 add = fold_convert (sizetype, add);
2702 if (scale != 1)
2703 add = size_binop (MULT_EXPR, add, size_int (scale));
2704 base = size_binop (PLUS_EXPR, base, add);
2705 continue;
2707 if (expr_invariant_in_loop_p (loop, op1))
2709 add = op1;
2710 off = op0;
2711 goto do_add;
2713 break;
2714 case MINUS_EXPR:
2715 if (expr_invariant_in_loop_p (loop, op1))
2717 add = fold_convert (sizetype, op1);
2718 add = size_binop (MINUS_EXPR, size_zero_node, add);
2719 off = op0;
2720 goto do_add;
2722 break;
2723 case MULT_EXPR:
2724 if (scale == 1 && host_integerp (op1, 0))
2726 scale = tree_low_cst (op1, 0);
2727 off = op0;
2728 continue;
2730 break;
2731 case SSA_NAME:
2732 off = op0;
2733 continue;
2734 CASE_CONVERT:
2735 if (!POINTER_TYPE_P (TREE_TYPE (op0))
2736 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
2737 break;
2738 if (TYPE_PRECISION (TREE_TYPE (op0))
2739 == TYPE_PRECISION (TREE_TYPE (off)))
2741 off = op0;
2742 continue;
2744 if (TYPE_PRECISION (TREE_TYPE (op0))
2745 < TYPE_PRECISION (TREE_TYPE (off)))
2747 off = op0;
2748 offtype = TREE_TYPE (off);
2749 STRIP_NOPS (off);
2750 continue;
2752 break;
2753 default:
2754 break;
2756 break;
2759 /* If at the end OFF still isn't a SSA_NAME or isn't
2760 defined in the loop, punt. */
2761 if (TREE_CODE (off) != SSA_NAME
2762 || expr_invariant_in_loop_p (loop, off))
2763 return NULL_TREE;
2765 if (offtype == NULL_TREE)
2766 offtype = TREE_TYPE (off);
2768 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
2769 offtype, scale);
2770 if (decl == NULL_TREE)
2771 return NULL_TREE;
2773 if (basep)
2774 *basep = base;
2775 if (offp)
2776 *offp = off;
2777 if (scalep)
2778 *scalep = scale;
2779 return decl;
2782 /* Function vect_analyze_data_refs.
2784 Find all the data references in the loop or basic block.
2786 The general structure of the analysis of data refs in the vectorizer is as
2787 follows:
2788 1- vect_analyze_data_refs(loop/bb): call
2789 compute_data_dependences_for_loop/bb to find and analyze all data-refs
2790 in the loop/bb and their dependences.
2791 2- vect_analyze_dependences(): apply dependence testing using ddrs.
2792 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
2793 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
2797 bool
2798 vect_analyze_data_refs (loop_vec_info loop_vinfo,
2799 bb_vec_info bb_vinfo,
2800 int *min_vf)
2802 struct loop *loop = NULL;
2803 basic_block bb = NULL;
2804 unsigned int i;
2805 vec<data_reference_p> datarefs;
2806 struct data_reference *dr;
2807 tree scalar_type;
2809 if (dump_enabled_p ())
2810 dump_printf_loc (MSG_NOTE, vect_location,
2811 "=== vect_analyze_data_refs ===\n");
2813 if (loop_vinfo)
2815 loop = LOOP_VINFO_LOOP (loop_vinfo);
2816 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo))
2817 || find_data_references_in_loop
2818 (loop, &LOOP_VINFO_DATAREFS (loop_vinfo)))
2820 if (dump_enabled_p ())
2821 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2822 "not vectorized: loop contains function calls"
2823 " or data references that cannot be analyzed");
2824 return false;
2827 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2829 else
2831 gimple_stmt_iterator gsi;
2833 bb = BB_VINFO_BB (bb_vinfo);
2834 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2836 gimple stmt = gsi_stmt (gsi);
2837 if (!find_data_references_in_stmt (NULL, stmt,
2838 &BB_VINFO_DATAREFS (bb_vinfo)))
2840 /* Mark the rest of the basic-block as unvectorizable. */
2841 for (; !gsi_end_p (gsi); gsi_next (&gsi))
2843 stmt = gsi_stmt (gsi);
2844 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
2846 break;
2850 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2853 /* Go through the data-refs, check that the analysis succeeded. Update
2854 pointer from stmt_vec_info struct to DR and vectype. */
2856 FOR_EACH_VEC_ELT (datarefs, i, dr)
2858 gimple stmt;
2859 stmt_vec_info stmt_info;
2860 tree base, offset, init;
2861 bool gather = false;
2862 int vf;
2864 again:
2865 if (!dr || !DR_REF (dr))
2867 if (dump_enabled_p ())
2868 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2869 "not vectorized: unhandled data-ref ");
2870 return false;
2873 stmt = DR_STMT (dr);
2874 stmt_info = vinfo_for_stmt (stmt);
2876 /* Discard clobbers from the dataref vector. We will remove
2877 clobber stmts during vectorization. */
2878 if (gimple_clobber_p (stmt))
2880 if (i == datarefs.length () - 1)
2882 datarefs.pop ();
2883 break;
2885 datarefs[i] = datarefs.pop ();
2886 goto again;
2889 /* Check that analysis of the data-ref succeeded. */
2890 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
2891 || !DR_STEP (dr))
2893 /* If the target supports vector gather loads, see whether one can
2894 be used. */
2895 if (loop_vinfo
2896 && DR_IS_READ (dr)
2897 && !TREE_THIS_VOLATILE (DR_REF (dr))
2898 && targetm.vectorize.builtin_gather != NULL
2899 && !nested_in_vect_loop_p (loop, stmt))
2901 struct data_reference *newdr
2902 = create_data_ref (NULL, loop_containing_stmt (stmt),
2903 DR_REF (dr), stmt, true);
2904 gcc_assert (newdr != NULL && DR_REF (newdr));
2905 if (DR_BASE_ADDRESS (newdr)
2906 && DR_OFFSET (newdr)
2907 && DR_INIT (newdr)
2908 && DR_STEP (newdr)
2909 && integer_zerop (DR_STEP (newdr)))
2911 dr = newdr;
2912 gather = true;
2914 else
2915 free_data_ref (newdr);
2918 if (!gather)
2920 if (dump_enabled_p ())
2922 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2923 "not vectorized: data ref analysis "
2924 "failed ");
2925 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2928 if (bb_vinfo)
2929 break;
2931 return false;
2935 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
2937 if (dump_enabled_p ())
2938 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2939 "not vectorized: base addr of dr is a "
2940 "constant");
2942 if (bb_vinfo)
2943 break;
2945 if (gather)
2946 free_data_ref (dr);
2947 return false;
2950 if (TREE_THIS_VOLATILE (DR_REF (dr)))
2952 if (dump_enabled_p ())
2954 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2955 "not vectorized: volatile type ");
2956 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2959 if (bb_vinfo)
2960 break;
2962 return false;
2965 if (stmt_can_throw_internal (stmt))
2967 if (dump_enabled_p ())
2969 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2970 "not vectorized: statement can throw an "
2971 "exception ");
2972 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2975 if (bb_vinfo)
2976 break;
2978 if (gather)
2979 free_data_ref (dr);
2980 return false;
2983 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
2984 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
2986 if (dump_enabled_p ())
2988 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2989 "not vectorized: statement is bitfield "
2990 "access ");
2991 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2994 if (bb_vinfo)
2995 break;
2997 if (gather)
2998 free_data_ref (dr);
2999 return false;
3002 base = unshare_expr (DR_BASE_ADDRESS (dr));
3003 offset = unshare_expr (DR_OFFSET (dr));
3004 init = unshare_expr (DR_INIT (dr));
3006 if (is_gimple_call (stmt))
3008 if (dump_enabled_p ())
3010 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3011 "not vectorized: dr in a call ");
3012 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3015 if (bb_vinfo)
3016 break;
3018 if (gather)
3019 free_data_ref (dr);
3020 return false;
3023 /* Update DR field in stmt_vec_info struct. */
3025 /* If the dataref is in an inner-loop of the loop that is considered
3026 for vectorization, we also want to analyze the access relative to
3027 the outer-loop (DR contains information only relative to the
3028 inner-most enclosing loop). We do that by building a reference to the
3029 first location accessed by the inner-loop, and analyze it relative to
3030 the outer-loop. */
3031 if (loop && nested_in_vect_loop_p (loop, stmt))
3033 tree outer_step, outer_base, outer_init;
3034 HOST_WIDE_INT pbitsize, pbitpos;
3035 tree poffset;
3036 enum machine_mode pmode;
3037 int punsignedp, pvolatilep;
3038 affine_iv base_iv, offset_iv;
3039 tree dinit;
3041 /* Build a reference to the first location accessed by the
3042 inner-loop: *(BASE+INIT). (The first location is actually
3043 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3044 tree inner_base = build_fold_indirect_ref
3045 (fold_build_pointer_plus (base, init));
3047 if (dump_enabled_p ())
3049 dump_printf_loc (MSG_NOTE, vect_location,
3050 "analyze in outer-loop: ");
3051 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3054 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3055 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3056 gcc_assert (outer_base != NULL_TREE);
3058 if (pbitpos % BITS_PER_UNIT != 0)
3060 if (dump_enabled_p ())
3061 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3062 "failed: bit offset alignment.\n");
3063 return false;
3066 outer_base = build_fold_addr_expr (outer_base);
3067 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3068 &base_iv, false))
3070 if (dump_enabled_p ())
3071 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3072 "failed: evolution of base is not affine.\n");
3073 return false;
3076 if (offset)
3078 if (poffset)
3079 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3080 poffset);
3081 else
3082 poffset = offset;
3085 if (!poffset)
3087 offset_iv.base = ssize_int (0);
3088 offset_iv.step = ssize_int (0);
3090 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3091 &offset_iv, false))
3093 if (dump_enabled_p ())
3094 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3095 "evolution of offset is not affine.\n");
3096 return false;
3099 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3100 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3101 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3102 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3103 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3105 outer_step = size_binop (PLUS_EXPR,
3106 fold_convert (ssizetype, base_iv.step),
3107 fold_convert (ssizetype, offset_iv.step));
3109 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3110 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3111 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3112 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3113 STMT_VINFO_DR_OFFSET (stmt_info) =
3114 fold_convert (ssizetype, offset_iv.base);
3115 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3116 size_int (highest_pow2_factor (offset_iv.base));
3118 if (dump_enabled_p ())
3120 dump_printf_loc (MSG_NOTE, vect_location,
3121 "\touter base_address: ");
3122 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3123 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3124 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3125 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3126 STMT_VINFO_DR_OFFSET (stmt_info));
3127 dump_printf (MSG_NOTE,
3128 "\n\touter constant offset from base address: ");
3129 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3130 STMT_VINFO_DR_INIT (stmt_info));
3131 dump_printf (MSG_NOTE, "\n\touter step: ");
3132 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3133 STMT_VINFO_DR_STEP (stmt_info));
3134 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3135 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3136 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3140 if (STMT_VINFO_DATA_REF (stmt_info))
3142 if (dump_enabled_p ())
3144 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3145 "not vectorized: more than one data ref "
3146 "in stmt: ");
3147 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3150 if (bb_vinfo)
3151 break;
3153 if (gather)
3154 free_data_ref (dr);
3155 return false;
3158 STMT_VINFO_DATA_REF (stmt_info) = dr;
3160 /* Set vectype for STMT. */
3161 scalar_type = TREE_TYPE (DR_REF (dr));
3162 STMT_VINFO_VECTYPE (stmt_info) =
3163 get_vectype_for_scalar_type (scalar_type);
3164 if (!STMT_VINFO_VECTYPE (stmt_info))
3166 if (dump_enabled_p ())
3168 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3169 "not vectorized: no vectype for stmt: ");
3170 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3171 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3172 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3173 scalar_type);
3176 if (bb_vinfo)
3177 break;
3179 if (gather)
3181 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3182 free_data_ref (dr);
3184 return false;
3186 else
3188 if (dump_enabled_p ())
3190 dump_printf_loc (MSG_NOTE, vect_location,
3191 "got vectype for stmt: ");
3192 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3193 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3194 STMT_VINFO_VECTYPE (stmt_info));
3198 /* Adjust the minimal vectorization factor according to the
3199 vector type. */
3200 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3201 if (vf > *min_vf)
3202 *min_vf = vf;
3204 if (gather)
3206 tree off;
3208 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3209 if (gather
3210 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3211 gather = false;
3212 if (!gather)
3214 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3215 free_data_ref (dr);
3216 if (dump_enabled_p ())
3218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3219 "not vectorized: not suitable for gather "
3220 "load ");
3221 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3223 return false;
3226 datarefs[i] = dr;
3227 STMT_VINFO_GATHER_P (stmt_info) = true;
3229 else if (loop_vinfo
3230 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3232 if (nested_in_vect_loop_p (loop, stmt)
3233 || !DR_IS_READ (dr))
3235 if (dump_enabled_p ())
3237 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3238 "not vectorized: not suitable for strided "
3239 "load ");
3240 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3242 return false;
3244 STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3248 /* If we stopped the analysis at the first dataref we could not analyze
3249 while trying to vectorize a basic block, mark the rest of the datarefs
3250 as not vectorizable and truncate the vector of datarefs. That
3251 avoids spending useless time analyzing their dependences. */
3252 if (i != datarefs.length ())
3254 gcc_assert (bb_vinfo != NULL);
3255 for (unsigned j = i; j < datarefs.length (); ++j)
3257 data_reference_p dr = datarefs[j];
3258 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3259 free_data_ref (dr);
3261 datarefs.truncate (i);
3264 return true;
3268 /* Function vect_get_new_vect_var.
3270 Returns a name for a new variable. The current naming scheme prepends
3271 the prefix "vect", "stmp" or "vectp" (depending on the value of VAR_KIND)
3272 to NAME, if provided, to form the name of a vectorizer-generated variable;
3273 otherwise the prefix alone is used. */
3275 tree
3276 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3278 const char *prefix;
3279 tree new_vect_var;
3281 switch (var_kind)
3283 case vect_simple_var:
3284 prefix = "vect";
3285 break;
3286 case vect_scalar_var:
3287 prefix = "stmp";
3288 break;
3289 case vect_pointer_var:
3290 prefix = "vectp";
3291 break;
3292 default:
3293 gcc_unreachable ();
3296 if (name)
3298 char* tmp = concat (prefix, "_", name, NULL);
3299 new_vect_var = create_tmp_reg (type, tmp);
3300 free (tmp);
3302 else
3303 new_vect_var = create_tmp_reg (type, prefix);
3305 return new_vect_var;
3309 /* Function vect_create_addr_base_for_vector_ref.
3311 Create an expression that computes the address of the first memory location
3312 that will be accessed for a data reference.
3314 Input:
3315 STMT: The statement containing the data reference.
3316 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3317 OFFSET: Optional. If supplied, it is added to the initial address.
3318 LOOP: Specify relative to which loop-nest the address should be computed.
3319 For example, when the dataref is in an inner-loop nested in an
3320 outer-loop that is now being vectorized, LOOP can be either the
3321 outer-loop, or the inner-loop. The first memory location accessed
3322 by the following dataref ('in' points to short):
3324 for (i=0; i<N; i++)
3325 for (j=0; j<M; j++)
3326 s += in[i+j]
3328 is as follows:
3329 if LOOP=i_loop: &in (relative to i_loop)
3330 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3332 Output:
3333 1. Return an SSA_NAME whose value is the address of the memory location of
3334 the first vector of the data reference.
3335 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3336 these statement(s) which define the returned SSA_NAME.
3338 FORNOW: We are only handling array accesses with step 1. */
3340 tree
3341 vect_create_addr_base_for_vector_ref (gimple stmt,
3342 gimple_seq *new_stmt_list,
3343 tree offset,
3344 struct loop *loop)
3346 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3347 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3348 tree data_ref_base;
3349 const char *base_name;
3350 tree addr_base;
3351 tree dest;
3352 gimple_seq seq = NULL;
3353 tree base_offset;
3354 tree init;
3355 tree vect_ptr_type;
3356 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3357 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3359 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3361 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3363 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3365 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3366 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3367 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3369 else
3371 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3372 base_offset = unshare_expr (DR_OFFSET (dr));
3373 init = unshare_expr (DR_INIT (dr));
3376 if (loop_vinfo)
3377 base_name = get_name (data_ref_base);
3378 else
3380 base_offset = ssize_int (0);
3381 init = ssize_int (0);
3382 base_name = get_name (DR_REF (dr));
3385 /* Create base_offset */
3386 base_offset = size_binop (PLUS_EXPR,
3387 fold_convert (sizetype, base_offset),
3388 fold_convert (sizetype, init));
3390 if (offset)
3392 offset = fold_build2 (MULT_EXPR, sizetype,
3393 fold_convert (sizetype, offset), step);
3394 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3395 base_offset, offset);
3398 /* base + base_offset */
3399 if (loop_vinfo)
3400 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3401 else
3403 addr_base = build1 (ADDR_EXPR,
3404 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3405 unshare_expr (DR_REF (dr)));
3408 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3409 addr_base = fold_convert (vect_ptr_type, addr_base);
3410 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3411 addr_base = force_gimple_operand (addr_base, &seq, false, dest);
3412 gimple_seq_add_seq (new_stmt_list, seq);
3414 if (DR_PTR_INFO (dr)
3415 && TREE_CODE (addr_base) == SSA_NAME)
3417 duplicate_ssa_name_ptr_info (addr_base, DR_PTR_INFO (dr));
3418 if (offset)
3419 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3422 if (dump_enabled_p ())
3424 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3425 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3428 return addr_base;
3432 /* Function vect_create_data_ref_ptr.
3434 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3435 location accessed in the loop by STMT, along with the def-use update
3436 chain to appropriately advance the pointer through the loop iterations.
3437 Also set aliasing information for the pointer. This pointer is used by
3438 the callers to this function to create a memory reference expression for
3439 vector load/store access.
3441 Input:
3442 1. STMT: a stmt that references memory. Expected to be of the form
3443 GIMPLE_ASSIGN <name, data-ref> or
3444 GIMPLE_ASSIGN <data-ref, name>.
3445 2. AGGR_TYPE: the type of the reference, which should be either a vector
3446 or an array.
3447 3. AT_LOOP: the loop where the vector memref is to be created.
3448 4. OFFSET (optional): an offset to be added to the initial address accessed
3449 by the data-ref in STMT.
3450 5. BSI: location where the new stmts are to be placed if there is no loop
3451 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
3452 pointing to the initial address.
3454 Output:
3455 1. Declare a new ptr to vector_type, and have it point to the base of the
3456 data reference (the initial address accessed by the data reference).
3457 For example, for vector of type V8HI, the following code is generated:
3459 v8hi *ap;
3460 ap = (v8hi *)initial_address;
3462 if OFFSET is not supplied:
3463 initial_address = &a[init];
3464 if OFFSET is supplied:
3465 initial_address = &a[init + OFFSET];
3467 Return the initial_address in INITIAL_ADDRESS.
3469 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
3470 update the pointer in each iteration of the loop.
3472 Return the increment stmt that updates the pointer in PTR_INCR.
3474 3. Set INV_P to true if the access pattern of the data reference in the
3475 vectorized loop is invariant. Set it to false otherwise.
3477 4. Return the pointer. */
3479 tree
3480 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
3481 tree offset, tree *initial_address,
3482 gimple_stmt_iterator *gsi, gimple *ptr_incr,
3483 bool only_init, bool *inv_p)
3485 const char *base_name;
3486 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3487 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3488 struct loop *loop = NULL;
3489 bool nested_in_vect_loop = false;
3490 struct loop *containing_loop = NULL;
3491 tree aggr_ptr_type;
3492 tree aggr_ptr;
3493 tree new_temp;
3494 gimple vec_stmt;
3495 gimple_seq new_stmt_list = NULL;
3496 edge pe = NULL;
3497 basic_block new_bb;
3498 tree aggr_ptr_init;
3499 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3500 tree aptr;
3501 gimple_stmt_iterator incr_gsi;
3502 bool insert_after;
3503 tree indx_before_incr, indx_after_incr;
3504 gimple incr;
3505 tree step;
3506 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
3508 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
3509 || TREE_CODE (aggr_type) == VECTOR_TYPE);
3511 if (loop_vinfo)
3513 loop = LOOP_VINFO_LOOP (loop_vinfo);
3514 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
3515 containing_loop = (gimple_bb (stmt))->loop_father;
3516 pe = loop_preheader_edge (loop);
3518 else
3520 gcc_assert (bb_vinfo);
3521 only_init = true;
3522 *ptr_incr = NULL;
3525 /* Check the step (evolution) of the load in LOOP, and record
3526 whether it's invariant. */
3527 if (nested_in_vect_loop)
3528 step = STMT_VINFO_DR_STEP (stmt_info);
3529 else
3530 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
3532 if (integer_zerop (step))
3533 *inv_p = true;
3534 else
3535 *inv_p = false;
3537 /* Create an expression for the first address accessed by this load
3538 in LOOP. */
3539 base_name = get_name (DR_BASE_ADDRESS (dr));
3541 if (dump_enabled_p ())
3543 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
3544 dump_printf_loc (MSG_NOTE, vect_location,
3545 "create %s-pointer variable to type: ",
3546 tree_code_name[(int) TREE_CODE (aggr_type)]);
3547 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
3548 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
3549 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
3550 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
3551 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
3552 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
3553 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
3554 else
3555 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
3556 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
3559 /* (1) Create the new aggregate-pointer variable.
3560 Vector and array types inherit the alias set of their component
3561 type by default so we need to use a ref-all pointer if the data
3562 reference does not conflict with the created aggregated data
3563 reference because it is not addressable. */
3564 bool need_ref_all = false;
3565 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
3566 get_alias_set (DR_REF (dr))))
3567 need_ref_all = true;
3568 /* Likewise for any of the data references in the stmt group. */
3569 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
3571 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
3574 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
3575 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
3576 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
3577 get_alias_set (DR_REF (sdr))))
3579 need_ref_all = true;
3580 break;
3582 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
3584 while (orig_stmt);
3586 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
3587 need_ref_all);
3588 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
3591 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
3592 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
3593 def-use update cycles for the pointer: one relative to the outer-loop
3594 (LOOP), which is what steps (3) and (4) below do. The other is relative
3595 to the inner-loop (which is the inner-most loop containing the dataref),
3596 and this is done by step (5) below.
3598 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
3599 inner-most loop, and so steps (3),(4) work the same, and step (5) is
3600 redundant. Steps (3),(4) create the following:
3602 vp0 = &base_addr;
3603 LOOP: vp1 = phi(vp0,vp2)
3606 vp2 = vp1 + step
3607 goto LOOP
3609 If there is an inner-loop nested in loop, then step (5) will also be
3610 applied, and an additional update in the inner-loop will be created:
3612 vp0 = &base_addr;
3613 LOOP: vp1 = phi(vp0,vp2)
3615 inner: vp3 = phi(vp1,vp4)
3616 vp4 = vp3 + inner_step
3617 if () goto inner
3619 vp2 = vp1 + step
3620 if () goto LOOP */
3622 /* (2) Calculate the initial address of the aggregate-pointer, and set
3623 the aggregate-pointer to point to it before the loop. */
3625 /* Create: (&(base[init_val+offset]) in the loop preheader. */
3627 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
3628 offset, loop);
3629 if (new_stmt_list)
3631 if (pe)
3633 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
3634 gcc_assert (!new_bb);
3636 else
3637 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
3640 *initial_address = new_temp;
3642 /* Create: p = (aggr_type *) initial_base */
3643 if (TREE_CODE (new_temp) != SSA_NAME
3644 || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
3646 vec_stmt = gimple_build_assign (aggr_ptr,
3647 fold_convert (aggr_ptr_type, new_temp));
3648 aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
3649 /* Copy the points-to information if it exists. */
3650 if (DR_PTR_INFO (dr))
3651 duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
3652 gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
3653 if (pe)
3655 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
3656 gcc_assert (!new_bb);
3658 else
3659 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
3661 else
3662 aggr_ptr_init = new_temp;
3664 /* (3) Handle the updating of the aggregate-pointer inside the loop.
3665 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
3666 inner-loop nested in LOOP (during outer-loop vectorization). */
3668 /* No update in loop is required. */
3669 if (only_init && (!loop_vinfo || at_loop == loop))
3670 aptr = aggr_ptr_init;
3671 else
3673 /* The step of the aggregate pointer is the type size. */
3674 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
3675 /* One exception to the above is when the scalar step of the load in
3676 LOOP is zero. In this case the step here is also zero. */
3677 if (*inv_p)
3678 iv_step = size_zero_node;
3679 else if (tree_int_cst_sgn (step) == -1)
3680 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
3682 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
3684 create_iv (aggr_ptr_init,
3685 fold_convert (aggr_ptr_type, iv_step),
3686 aggr_ptr, loop, &incr_gsi, insert_after,
3687 &indx_before_incr, &indx_after_incr);
3688 incr = gsi_stmt (incr_gsi);
3689 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
3691 /* Copy the points-to information if it exists. */
3692 if (DR_PTR_INFO (dr))
3694 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
3695 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
3697 if (ptr_incr)
3698 *ptr_incr = incr;
3700 aptr = indx_before_incr;
3703 if (!nested_in_vect_loop || only_init)
3704 return aptr;
3707 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
3708 nested in LOOP, if one exists. */
3710 gcc_assert (nested_in_vect_loop);
3711 if (!only_init)
3713 standard_iv_increment_position (containing_loop, &incr_gsi,
3714 &insert_after);
3715 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
3716 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
3717 &indx_after_incr);
3718 incr = gsi_stmt (incr_gsi);
3719 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
3721 /* Copy the points-to information if it exists. */
3722 if (DR_PTR_INFO (dr))
3724 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
3725 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
3727 if (ptr_incr)
3728 *ptr_incr = incr;
3730 return indx_before_incr;
3732 else
3733 gcc_unreachable ();
3737 /* Function bump_vector_ptr
3739 Increment a pointer (to a vector type) by vector-size. If requested,
3740 i.e. if PTR_INCR is given, then also connect the new increment stmt
3741 to the existing def-use update-chain of the pointer, by modifying
3742 the PTR_INCR as illustrated below:
3744 The pointer def-use update-chain before this function:
3745 DATAREF_PTR = phi (p_0, p_2)
3746 ....
3747 PTR_INCR: p_2 = DATAREF_PTR + step
3749 The pointer def-use update-chain after this function:
3750 DATAREF_PTR = phi (p_0, p_2)
3751 ....
3752 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
3753 ....
3754 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
3756 Input:
3757 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
3758 in the loop.
3759 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
3760 the loop. The increment amount across iterations is expected
3761 to be vector_size.
3762 GSI - location where the new update stmt is to be placed.
3763 STMT - the original scalar memory-access stmt that is being vectorized.
3764 BUMP - optional. The offset by which to bump the pointer. If not given,
3765 the offset is assumed to be vector_size.
3767 Output: Return NEW_DATAREF_PTR as illustrated above.
3771 tree
3772 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
3773 gimple stmt, tree bump)
3775 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3776 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3777 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3778 tree update = TYPE_SIZE_UNIT (vectype);
3779 gimple incr_stmt;
3780 ssa_op_iter iter;
3781 use_operand_p use_p;
3782 tree new_dataref_ptr;
3784 if (bump)
3785 update = bump;
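/* For illustration, assuming VECTYPE is V4SF: TYPE_SIZE_UNIT (vectype) is
   16 bytes, so when no BUMP is supplied the pointer below is advanced by
   16. */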
3787 new_dataref_ptr = copy_ssa_name (dataref_ptr, NULL);
3788 incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, new_dataref_ptr,
3789 dataref_ptr, update);
3790 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
3792 /* Copy the points-to information if it exists. */
3793 if (DR_PTR_INFO (dr))
3795 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
3796 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
3799 if (!ptr_incr)
3800 return new_dataref_ptr;
3802 /* Update the vector-pointer's cross-iteration increment. */
3803 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
3805 tree use = USE_FROM_PTR (use_p);
3807 if (use == dataref_ptr)
3808 SET_USE (use_p, new_dataref_ptr);
3809 else
3810 gcc_assert (tree_int_cst_compare (use, update) == 0);
3813 return new_dataref_ptr;
3817 /* Function vect_create_destination_var.
3819 Create a new temporary of type VECTYPE. */
3821 tree
3822 vect_create_destination_var (tree scalar_dest, tree vectype)
3824 tree vec_dest;
3825 const char *name;
3826 char *new_name;
3827 tree type;
3828 enum vect_var_kind kind;
3830 kind = vectype ? vect_simple_var : vect_scalar_var;
3831 type = vectype ? vectype : TREE_TYPE (scalar_dest);
3833 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
3835 name = get_name (scalar_dest);
3836 if (name)
3837 asprintf (&new_name, "%s_%u", name, SSA_NAME_VERSION (scalar_dest));
3838 else
3839 asprintf (&new_name, "_%u", SSA_NAME_VERSION (scalar_dest));
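/* For illustration: a scalar destination with base name "x" and SSA
   version 7 yields the name hint "x_7"; an anonymous SSA name of version 7
   yields "_7". */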
3840 vec_dest = vect_get_new_vect_var (type, kind, new_name);
3841 free (new_name);
3843 return vec_dest;
3846 /* Function vect_grouped_store_supported.
3848 Returns TRUE if interleave high and interleave low permutations
3849 are supported, and FALSE otherwise. */
3851 bool
3852 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
3854 enum machine_mode mode = TYPE_MODE (vectype);
3856 /* vect_permute_store_chain requires the group size to be a power of two. */
3857 if (exact_log2 (count) == -1)
3859 if (dump_enabled_p ())
3860 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3861 "the size of the group of accesses"
3862 " is not a power of 2");
3863 return false;
3866 /* Check that the permutation is supported. */
3867 if (VECTOR_MODE_P (mode))
3869 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3870 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3871 for (i = 0; i < nelt / 2; i++)
3873 sel[i * 2] = i;
3874 sel[i * 2 + 1] = i + nelt;
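/* For illustration, assuming an 8-element vector mode (nelt == 8): the
   loop above builds the interleave-high selector {0, 8, 1, 9, 2, 10, 3, 11};
   adding nelt/2 to each entry below turns it into the interleave-low
   selector {4, 12, 5, 13, 6, 14, 7, 15}. */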
3876 if (can_vec_perm_p (mode, false, sel))
3878 for (i = 0; i < nelt; i++)
3879 sel[i] += nelt / 2;
3880 if (can_vec_perm_p (mode, false, sel))
3881 return true;
3885 if (dump_enabled_p ())
3886 dump_printf (MSG_MISSED_OPTIMIZATION,
3887 "interleave op not supported by target.");
3888 return false;
3892 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
3893 type VECTYPE. */
3895 bool
3896 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
3898 return vect_lanes_optab_supported_p ("vec_store_lanes",
3899 vec_store_lanes_optab,
3900 vectype, count);
3904 /* Function vect_permute_store_chain.
3906 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
3907 a power of 2, generate interleave_high/low stmts to reorder the data
3908 correctly for the stores. Return the final references for stores in
3909 RESULT_CHAIN.
3911 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
3912 The input is 4 vectors each containing 8 elements. We assign a number to
3913 each element, the input sequence is:
3915 1st vec: 0 1 2 3 4 5 6 7
3916 2nd vec: 8 9 10 11 12 13 14 15
3917 3rd vec: 16 17 18 19 20 21 22 23
3918 4th vec: 24 25 26 27 28 29 30 31
3920 The output sequence should be:
3922 1st vec: 0 8 16 24 1 9 17 25
3923 2nd vec: 2 10 18 26 3 11 19 27
3924 3rd vec: 4 12 20 28 5 13 21 29
3925 4th vec: 6 14 22 30 7 15 23 31
3927 i.e., we interleave the contents of the four vectors in their order.
3929 We use interleave_high/low instructions to create such output. The input of
3930 each interleave_high/low operation is two vectors:
3931 1st vec 2nd vec
3932 0 1 2 3 4 5 6 7
3933 the even elements of the result vector are obtained left-to-right from the
3934 high/low elements of the first vector. The odd elements of the result are
3935 obtained left-to-right from the high/low elements of the second vector.
3936 The output of interleave_high will be: 0 4 1 5
3937 and of interleave_low: 2 6 3 7
3940 The permutation is done in log LENGTH stages. In each stage interleave_high
3941 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
3942 where the first argument is taken from the first half of DR_CHAIN and the
3943 second argument from its second half.
3944 In our example,
3946 I1: interleave_high (1st vec, 3rd vec)
3947 I2: interleave_low (1st vec, 3rd vec)
3948 I3: interleave_high (2nd vec, 4th vec)
3949 I4: interleave_low (2nd vec, 4th vec)
3951 The output for the first stage is:
3953 I1: 0 16 1 17 2 18 3 19
3954 I2: 4 20 5 21 6 22 7 23
3955 I3: 8 24 9 25 10 26 11 27
3956 I4: 12 28 13 29 14 30 15 31
3958 The output of the second stage, i.e. the final result is:
3960 I1: 0 8 16 24 1 9 17 25
3961 I2: 2 10 18 26 3 11 19 27
3962 I3: 4 12 20 28 5 13 21 29
3963 I4: 6 14 22 30 7 15 23 31. */
3965 void
3966 vect_permute_store_chain (vec<tree> dr_chain,
3967 unsigned int length,
3968 gimple stmt,
3969 gimple_stmt_iterator *gsi,
3970 vec<tree> *result_chain)
3972 tree vect1, vect2, high, low;
3973 gimple perm_stmt;
3974 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
3975 tree perm_mask_low, perm_mask_high;
3976 unsigned int i, n;
3977 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
3978 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3980 result_chain->quick_grow (length);
3981 memcpy (result_chain->address (), dr_chain.address (),
3982 length * sizeof (tree));
3984 for (i = 0, n = nelt / 2; i < n; i++)
3986 sel[i * 2] = i;
3987 sel[i * 2 + 1] = i + nelt;
3989 perm_mask_high = vect_gen_perm_mask (vectype, sel);
3990 gcc_assert (perm_mask_high != NULL);
3992 for (i = 0; i < nelt; i++)
3993 sel[i] += nelt / 2;
3994 perm_mask_low = vect_gen_perm_mask (vectype, sel);
3995 gcc_assert (perm_mask_low != NULL);
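/* For illustration, assuming nelt == 4: perm_mask_high is {0, 4, 1, 5}, so
   VEC_PERM_EXPR <v1, v2, {0, 4, 1, 5}> selects {v1[0], v2[0], v1[1], v2[1]}
   (selector entries >= nelt index into the second operand), which is the
   interleave_high result shown in the function comment; perm_mask_low is
   {2, 6, 3, 7}. */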
3997 for (i = 0, n = exact_log2 (length); i < n; i++)
3999 for (j = 0; j < length/2; j++)
4001 vect1 = dr_chain[j];
4002 vect2 = dr_chain[j+length/2];
4004 /* Create interleaving stmt:
4005 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
4006 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4007 perm_stmt
4008 = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
4009 vect1, vect2, perm_mask_high);
4010 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4011 (*result_chain)[2*j] = high;
4013 /* Create interleaving stmt:
4014 low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
4015 nelt*3/2+1, ...}> */
4016 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4017 perm_stmt
4018 = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
4019 vect1, vect2, perm_mask_low);
4020 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4021 (*result_chain)[2*j+1] = low;
4023 memcpy (dr_chain.address (), result_chain->address (),
4024 length * sizeof (tree));
4028 /* Function vect_setup_realignment
4030 This function is called when vectorizing an unaligned load using
4031 the dr_explicit_realign[_optimized] scheme.
4032 This function generates the following code at the loop prolog:
4034 p = initial_addr;
4035 x msq_init = *(floor(p)); # prolog load
4036 realignment_token = call target_builtin;
4037 loop:
4038 x msq = phi (msq_init, ---)
4040 The stmts marked with x are generated only for the case of
4041 dr_explicit_realign_optimized.
4043 The code above sets up a new (vector) pointer, pointing to the first
4044 location accessed by STMT, and a "floor-aligned" load using that pointer.
4045 It also generates code to compute the "realignment-token" (if the relevant
4046 target hook was defined), and creates a phi-node at the loop-header bb
4047 whose arguments are the result of the prolog-load (created by this
4048 function) and the result of a load that takes place in the loop (to be
4049 created by the caller to this function).
4051 For the case of dr_explicit_realign_optimized:
4052 The caller to this function uses the phi-result (msq) to create the
4053 realignment code inside the loop, and sets up the missing phi argument,
4054 as follows:
4055 loop:
4056 msq = phi (msq_init, lsq)
4057 lsq = *(floor(p')); # load in loop
4058 result = realign_load (msq, lsq, realignment_token);
4060 For the case of dr_explicit_realign:
4061 loop:
4062 msq = *(floor(p)); # load in loop
4063 p' = p + (VS-1);
4064 lsq = *(floor(p')); # load in loop
4065 result = realign_load (msq, lsq, realignment_token);
4067 Input:
4068 STMT - (scalar) load stmt to be vectorized. This load accesses
4069 a memory location that may be unaligned.
4070 GSI - place where new code is to be inserted.
4071 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4072 is used.
4074 Output:
4075 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4076 target hook, if defined.
4077 Return value - the result of the loop-header phi node. */
4079 tree
4080 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4081 tree *realignment_token,
4082 enum dr_alignment_support alignment_support_scheme,
4083 tree init_addr,
4084 struct loop **at_loop)
4086 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4087 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4088 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4089 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4090 struct loop *loop = NULL;
4091 edge pe = NULL;
4092 tree scalar_dest = gimple_assign_lhs (stmt);
4093 tree vec_dest;
4094 gimple inc;
4095 tree ptr;
4096 tree data_ref;
4097 gimple new_stmt;
4098 basic_block new_bb;
4099 tree msq_init = NULL_TREE;
4100 tree new_temp;
4101 gimple phi_stmt;
4102 tree msq = NULL_TREE;
4103 gimple_seq stmts = NULL;
4104 bool inv_p;
4105 bool compute_in_loop = false;
4106 bool nested_in_vect_loop = false;
4107 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4108 struct loop *loop_for_initial_load = NULL;
4110 if (loop_vinfo)
4112 loop = LOOP_VINFO_LOOP (loop_vinfo);
4113 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4116 gcc_assert (alignment_support_scheme == dr_explicit_realign
4117 || alignment_support_scheme == dr_explicit_realign_optimized);
4119 /* We need to generate three things:
4120 1. the misalignment computation
4121 2. the extra vector load (for the optimized realignment scheme).
4122 3. the phi node for the two vectors from which the realignment is
4123 done (for the optimized realignment scheme). */
4125 /* 1. Determine where to generate the misalignment computation.
4127 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4128 calculation will be generated by this function, outside the loop (in the
4129 preheader). Otherwise, INIT_ADDR has already been computed for us by the
4130 caller, inside the loop.
4132 Background: If the misalignment remains fixed throughout the iterations of
4133 the loop, then both realignment schemes are applicable, and also the
4134 misalignment computation can be done outside LOOP. This is because we are
4135 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4136 are a multiple of VS (the Vector Size), and therefore the misalignment in
4137 different vectorized LOOP iterations is always the same.
4138 The problem arises only if the memory access is in an inner-loop nested
4139 inside LOOP, which is now being vectorized using outer-loop vectorization.
4140 This is the only case when the misalignment of the memory access may not
4141 remain fixed throughout the iterations of the inner-loop (as explained in
4142 detail in vect_supportable_dr_alignment). In this case, not only is the
4143 optimized realignment scheme not applicable, but also the misalignment
4144 computation (and generation of the realignment token that is passed to
4145 REALIGN_LOAD) have to be done inside the loop.
4147 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4148 or not, which in turn determines if the misalignment is computed inside
4149 the inner-loop, or outside LOOP. */
4151 if (init_addr != NULL_TREE || !loop_vinfo)
4153 compute_in_loop = true;
4154 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4158 /* 2. Determine where to generate the extra vector load.
4160 For the optimized realignment scheme, instead of generating two vector
4161 loads in each iteration, we generate a single extra vector load in the
4162 preheader of the loop, and in each iteration reuse the result of the
4163 vector load from the previous iteration. In case the memory access is in
4164 an inner-loop nested inside LOOP, which is now being vectorized using
4165 outer-loop vectorization, we need to determine whether this initial vector
4166 load should be generated at the preheader of the inner-loop, or can be
4167 generated at the preheader of LOOP. If the memory access has no evolution
4168 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4169 to be generated inside LOOP (in the preheader of the inner-loop). */
4171 if (nested_in_vect_loop)
4173 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4174 bool invariant_in_outerloop =
4175 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4176 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4178 else
4179 loop_for_initial_load = loop;
4180 if (at_loop)
4181 *at_loop = loop_for_initial_load;
4183 if (loop_for_initial_load)
4184 pe = loop_preheader_edge (loop_for_initial_load);
4186 /* 3. For the case of the optimized realignment, create the first vector
4187 load at the loop preheader. */
4189 if (alignment_support_scheme == dr_explicit_realign_optimized)
4191 /* Create msq_init = *(floor(p1)) in the loop preheader */
4193 gcc_assert (!compute_in_loop);
4194 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4195 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4196 NULL_TREE, &init_addr, NULL, &inc,
4197 true, &inv_p);
4198 new_temp = copy_ssa_name (ptr, NULL);
4199 new_stmt = gimple_build_assign_with_ops
4200 (BIT_AND_EXPR, new_temp, ptr,
4201 build_int_cst (TREE_TYPE (ptr),
4202 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
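/* For illustration, assuming a 16-byte vector type: TYPE_ALIGN_UNIT is 16
   and the mask is -16, so the BIT_AND_EXPR above clears the low four
   address bits, producing the floor (p) address used by the prolog load. */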
4203 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4204 gcc_assert (!new_bb);
4205 data_ref
4206 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4207 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4208 new_stmt = gimple_build_assign (vec_dest, data_ref);
4209 new_temp = make_ssa_name (vec_dest, new_stmt);
4210 gimple_assign_set_lhs (new_stmt, new_temp);
4211 if (pe)
4213 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4214 gcc_assert (!new_bb);
4216 else
4217 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4219 msq_init = gimple_assign_lhs (new_stmt);
4222 /* 4. Create realignment token using a target builtin, if available.
4223 It is done either inside the containing loop, or before LOOP (as
4224 determined above). */
4226 if (targetm.vectorize.builtin_mask_for_load)
4228 tree builtin_decl;
4230 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4231 if (!init_addr)
4233 /* Generate the INIT_ADDR computation outside LOOP. */
4234 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4235 NULL_TREE, loop);
4236 if (loop)
4238 pe = loop_preheader_edge (loop);
4239 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4240 gcc_assert (!new_bb);
4242 else
4243 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4246 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4247 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4248 vec_dest =
4249 vect_create_destination_var (scalar_dest,
4250 gimple_call_return_type (new_stmt));
4251 new_temp = make_ssa_name (vec_dest, new_stmt);
4252 gimple_call_set_lhs (new_stmt, new_temp);
4254 if (compute_in_loop)
4255 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4256 else
4258 /* Generate the misalignment computation outside LOOP. */
4259 pe = loop_preheader_edge (loop);
4260 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4261 gcc_assert (!new_bb);
4264 *realignment_token = gimple_call_lhs (new_stmt);
4266 /* The result of the CALL_EXPR to this builtin is determined from
4267 the value of the parameter and no global variables are touched
4268 which makes the builtin a "const" function. Requiring the
4269 builtin to have the "const" attribute makes it unnecessary
4270 to call mark_call_clobbered. */
4271 gcc_assert (TREE_READONLY (builtin_decl));
4274 if (alignment_support_scheme == dr_explicit_realign)
4275 return msq;
4277 gcc_assert (!compute_in_loop);
4278 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4281 /* 5. Create msq = phi <msq_init, lsq> in loop */
4283 pe = loop_preheader_edge (containing_loop);
4284 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4285 msq = make_ssa_name (vec_dest, NULL);
4286 phi_stmt = create_phi_node (msq, containing_loop->header);
4287 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4289 return msq;
4293 /* Function vect_grouped_load_supported.
4295 Returns TRUE if the extract-even and extract-odd permutations are supported,
4296 and FALSE otherwise. */
4298 bool
4299 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4301 enum machine_mode mode = TYPE_MODE (vectype);
4303 /* vect_permute_load_chain requires the group size to be a power of two. */
4304 if (exact_log2 (count) == -1)
4306 if (dump_enabled_p ())
4307 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4308 "the size of the group of accesses"
4309 " is not a power of 2");
4310 return false;
4313 /* Check that the permutation is supported. */
4314 if (VECTOR_MODE_P (mode))
4316 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4317 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4319 for (i = 0; i < nelt; i++)
4320 sel[i] = i * 2;
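/* For illustration, assuming an 8-element vector mode (nelt == 8): this
   builds the extract-even selector {0, 2, 4, 6, 8, 10, 12, 14}; the loop
   below builds the extract-odd selector {1, 3, 5, 7, 9, 11, 13, 15}. */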
4321 if (can_vec_perm_p (mode, false, sel))
4323 for (i = 0; i < nelt; i++)
4324 sel[i] = i * 2 + 1;
4325 if (can_vec_perm_p (mode, false, sel))
4326 return true;
4330 if (dump_enabled_p ())
4331 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4332 "extract even/odd not supported by target");
4333 return false;
4336 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
4337 type VECTYPE. */
4339 bool
4340 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4342 return vect_lanes_optab_supported_p ("vec_load_lanes",
4343 vec_load_lanes_optab,
4344 vectype, count);
4347 /* Function vect_permute_load_chain.
4349 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
4350 a power of 2, generate extract_even/odd stmts to reorder the input data
4351 correctly. Return the final references for loads in RESULT_CHAIN.
4353 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4354 The input is 4 vectors each containing 8 elements. We assign a number to each
4355 element, the input sequence is:
4357 1st vec: 0 1 2 3 4 5 6 7
4358 2nd vec: 8 9 10 11 12 13 14 15
4359 3rd vec: 16 17 18 19 20 21 22 23
4360 4th vec: 24 25 26 27 28 29 30 31
4362 The output sequence should be:
4364 1st vec: 0 4 8 12 16 20 24 28
4365 2nd vec: 1 5 9 13 17 21 25 29
4366 3rd vec: 2 6 10 14 18 22 26 30
4367 4th vec: 3 7 11 15 19 23 27 31
4369 i.e., the first output vector should contain the first elements of each
4370 interleaving group, etc.
4372 We use extract_even/odd instructions to create such output. The input of
4373 each extract_even/odd operation is two vectors
4374 1st vec 2nd vec
4375 0 1 2 3 4 5 6 7
4377 and the output is the vector of extracted even/odd elements. The output of
4378 extract_even will be: 0 2 4 6
4379 and of extract_odd: 1 3 5 7
4382 The permutation is done in log LENGTH stages. In each stage extract_even
4383 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
4384 their order. In our example,
4386 E1: extract_even (1st vec, 2nd vec)
4387 E2: extract_odd (1st vec, 2nd vec)
4388 E3: extract_even (3rd vec, 4th vec)
4389 E4: extract_odd (3rd vec, 4th vec)
4391 The output for the first stage will be:
4393 E1: 0 2 4 6 8 10 12 14
4394 E2: 1 3 5 7 9 11 13 15
4395 E3: 16 18 20 22 24 26 28 30
4396 E4: 17 19 21 23 25 27 29 31
4398 In order to proceed and create the correct sequence for the next stage (or
4399 for the correct output, if the second stage is the last one, as in our
4400 example), we first put the output of the extract_even operation and then the
4401 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
4402 The input for the second stage is:
4404 1st vec (E1): 0 2 4 6 8 10 12 14
4405 2nd vec (E3): 16 18 20 22 24 26 28 30
4406 3rd vec (E2): 1 3 5 7 9 11 13 15
4407 4th vec (E4): 17 19 21 23 25 27 29 31
4409 The output of the second stage:
4411 E1: 0 4 8 12 16 20 24 28
4412 E2: 2 6 10 14 18 22 26 30
4413 E3: 1 5 9 13 17 21 25 29
4414 E4: 3 7 11 15 19 23 27 31
4416 And RESULT_CHAIN after reordering:
4418 1st vec (E1): 0 4 8 12 16 20 24 28
4419 2nd vec (E3): 1 5 9 13 17 21 25 29
4420 3rd vec (E2): 2 6 10 14 18 22 26 30
4421 4th vec (E4): 3 7 11 15 19 23 27 31. */
4423 static void
4424 vect_permute_load_chain (vec<tree> dr_chain,
4425 unsigned int length,
4426 gimple stmt,
4427 gimple_stmt_iterator *gsi,
4428 vec<tree> *result_chain)
4430 tree data_ref, first_vect, second_vect;
4431 tree perm_mask_even, perm_mask_odd;
4432 gimple perm_stmt;
4433 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4434 unsigned int i, j, log_length = exact_log2 (length);
4435 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
4436 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4438 result_chain->quick_grow (length);
4439 memcpy (result_chain->address (), dr_chain.address (),
4440 length * sizeof (tree));
4442 for (i = 0; i < nelt; ++i)
4443 sel[i] = i * 2;
4444 perm_mask_even = vect_gen_perm_mask (vectype, sel);
4445 gcc_assert (perm_mask_even != NULL);
4447 for (i = 0; i < nelt; ++i)
4448 sel[i] = i * 2 + 1;
4449 perm_mask_odd = vect_gen_perm_mask (vectype, sel);
4450 gcc_assert (perm_mask_odd != NULL);
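/* For illustration, assuming nelt == 4: perm_mask_even is {0, 2, 4, 6}, so
   VEC_PERM_EXPR <v1, v2, {0, 2, 4, 6}> selects {v1[0], v1[2], v2[0], v2[2]}
   (selector entries >= nelt index into the second operand), matching the
   extract_even result shown in the function comment; perm_mask_odd is
   {1, 3, 5, 7}. */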
4452 for (i = 0; i < log_length; i++)
4454 for (j = 0; j < length; j += 2)
4456 first_vect = dr_chain[j];
4457 second_vect = dr_chain[j+1];
4459 /* data_ref = permute_even (first_data_ref, second_data_ref); */
4460 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
4461 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4462 first_vect, second_vect,
4463 perm_mask_even);
4464 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4465 (*result_chain)[j/2] = data_ref;
4467 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
4468 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
4469 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4470 first_vect, second_vect,
4471 perm_mask_odd);
4472 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4473 (*result_chain)[j/2+length/2] = data_ref;
4475 memcpy (dr_chain.address (), result_chain->address (),
4476 length * sizeof (tree));
4481 /* Function vect_transform_grouped_load.
4483 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
4484 to perform their permutation and ascribe the resulting vectorized statements to
4485 the scalar statements.
4488 void
4489 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
4490 gimple_stmt_iterator *gsi)
4492 vec<tree> result_chain = vNULL;
4494 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
4495 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the permuted
4496 vectors, which are ready for vector computation. */
4497 result_chain.create (size);
4498 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
4499 vect_record_grouped_load_vectors (stmt, result_chain);
4500 result_chain.release ();
4503 /* RESULT_CHAIN contains the output of a group of grouped loads that were
4504 generated as part of the vectorization of STMT. Assign the statement
4505 for each vector to the associated scalar statement. */
4507 void
4508 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
4510 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
4511 gimple next_stmt, new_stmt;
4512 unsigned int i, gap_count;
4513 tree tmp_data_ref;
4515 /* Put a permuted data-ref in the VECTORIZED_STMT field.
4516 Since we scan the chain starting from its first node, their order
4517 corresponds to the order of data-refs in RESULT_CHAIN. */
4518 next_stmt = first_stmt;
4519 gap_count = 1;
4520 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
4522 if (!next_stmt)
4523 break;
4525 /* Skip the gaps. Loads created for the gaps will be removed by the dead
4526 code elimination pass later. No need to check for the first stmt in
4527 the group, since it always exists.
4528 GROUP_GAP is the number of steps in elements from the previous
4529 access (if there is no gap GROUP_GAP is 1). We skip loads that
4530 correspond to the gaps. */
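/* For illustration (a hedged example): if the group accesses a[i] and
   a[i+2], GROUP_GAP of the second access is 2, so the test below skips one
   RESULT_CHAIN entry (the permuted vector that would have corresponded to
   the missing a[i+1]). */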
4531 if (next_stmt != first_stmt
4532 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
4534 gap_count++;
4535 continue;
4538 while (next_stmt)
4540 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
4541 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
4542 copies, and we put the new vector statement in the first available
4543 RELATED_STMT. */
4544 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
4545 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
4546 else
4548 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4550 gimple prev_stmt =
4551 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
4552 gimple rel_stmt =
4553 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
4554 while (rel_stmt)
4556 prev_stmt = rel_stmt;
4557 rel_stmt =
4558 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
4561 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
4562 new_stmt;
4566 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
4567 gap_count = 1;
4568 /* If NEXT_STMT accesses the same DR as the previous statement,
4569 put the same TMP_DATA_REF as its vectorized statement; otherwise
4570 get the next data-ref from RESULT_CHAIN. */
4571 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4572 break;
4577 /* Function vect_can_force_dr_alignment_p.
4579 Returns whether the alignment of a DECL can be forced to be aligned
4580 on an ALIGNMENT-bit boundary. */
4582 bool
4583 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
4585 if (TREE_CODE (decl) != VAR_DECL)
4586 return false;
4588 /* We cannot change alignment of common or external symbols as another
4589 translation unit may contain a definition with lower alignment.
4590 The rules of common symbol linking mean that the definition
4591 will override the common symbol. The same is true for constant
4592 pool entries which may be shared and are not properly merged
4593 by LTO. */
4594 if (DECL_EXTERNAL (decl)
4595 || DECL_COMMON (decl)
4596 || DECL_IN_CONSTANT_POOL (decl))
4597 return false;
4599 if (TREE_ASM_WRITTEN (decl))
4600 return false;
4602 /* Do not override the alignment as specified by the ABI when the used
4603 attribute is set. */
4604 if (DECL_PRESERVE_P (decl))
4605 return false;
4607 /* Do not override explicit alignment set by the user when an explicit
4608 section name is also used. This is a common idiom used by many
4609 software projects. */
4610 if (DECL_SECTION_NAME (decl) != NULL_TREE
4611 && !DECL_HAS_IMPLICIT_SECTION_NAME_P (decl))
4612 return false;
4614 if (TREE_STATIC (decl))
4615 return (alignment <= MAX_OFILE_ALIGNMENT);
4616 else
4617 return (alignment <= MAX_STACK_ALIGNMENT);
4621 /* Return whether the data reference DR is supported with respect to its
4622 alignment.
4623 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
4624 if it is aligned, i.e., check if it is possible to vectorize it with different
4625 alignment. */
4627 enum dr_alignment_support
4628 vect_supportable_dr_alignment (struct data_reference *dr,
4629 bool check_aligned_accesses)
4631 gimple stmt = DR_STMT (dr);
4632 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4633 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4634 enum machine_mode mode = TYPE_MODE (vectype);
4635 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4636 struct loop *vect_loop = NULL;
4637 bool nested_in_vect_loop = false;
4639 if (aligned_access_p (dr) && !check_aligned_accesses)
4640 return dr_aligned;
4642 if (loop_vinfo)
4644 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
4645 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
4648 /* Possibly unaligned access. */
4650 /* We can choose between using the implicit realignment scheme (generating
4651 a misaligned_move stmt) and the explicit realignment scheme (generating
4652 aligned loads with a REALIGN_LOAD). There are two variants of the
4653 explicit realignment scheme: optimized, and unoptimized.
4654 We can optimize the realignment only if the step between consecutive
4655 vector loads is equal to the vector size. Since the vector memory
4656 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
4657 is guaranteed that the misalignment amount remains the same throughout the
4658 execution of the vectorized loop. Therefore, we can create the
4659 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
4660 at the loop preheader.
4662 However, in the case of outer-loop vectorization, when vectorizing a
4663 memory access in the inner-loop nested within the LOOP that is now being
4664 vectorized, while it is guaranteed that the misalignment of the
4665 vectorized memory access will remain the same in different outer-loop
4666 iterations, it is *not* guaranteed that it will remain the same throughout
4667 the execution of the inner-loop. This is because the inner-loop advances
4668 with the original scalar step (and not in steps of VS). If the inner-loop
4669 step happens to be a multiple of VS, then the misalignment remains fixed
4670 and we can use the optimized realignment scheme. For example:
4672 for (i=0; i<N; i++)
4673 for (j=0; j<M; j++)
4674 s += a[i+j];
4676 When vectorizing the i-loop in the above example, the step between
4677 consecutive vector loads is 1, and so the misalignment does not remain
4678 fixed across the execution of the inner-loop, and the realignment cannot
4679 be optimized (as illustrated in the following pseudo vectorized loop):
4681 for (i=0; i<N; i+=4)
4682 for (j=0; j<M; j++){
4683 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
4684 // when j is {0,1,2,3,4,5,6,7,...} respectively.
4685 // (assuming that we start from an aligned address).
4688 We therefore have to use the unoptimized realignment scheme:
4690 for (i=0; i<N; i+=4)
4691 for (j=k; j<M; j+=4)
4692 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
4693 // that the misalignment of the initial address is
4694 // 0).
4696 The loop can then be vectorized as follows:
4698 for (k=0; k<4; k++){
4699 rt = get_realignment_token (&vp[k]);
4700 for (i=0; i<N; i+=4){
4701 v1 = vp[i+k];
4702 for (j=k; j<M; j+=4){
4703 v2 = vp[i+j+VS-1];
4704 va = REALIGN_LOAD <v1,v2,rt>;
4705 vs += va;
4706 v1 = v2;
4709 } */
4711 if (DR_IS_READ (dr))
4713 bool is_packed = false;
4714 tree type = (TREE_TYPE (DR_REF (dr)));
4716 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
4717 && (!targetm.vectorize.builtin_mask_for_load
4718 || targetm.vectorize.builtin_mask_for_load ()))
4720 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4721 if ((nested_in_vect_loop
4722 && (TREE_INT_CST_LOW (DR_STEP (dr))
4723 != GET_MODE_SIZE (TYPE_MODE (vectype))))
4724 || !loop_vinfo)
4725 return dr_explicit_realign;
4726 else
4727 return dr_explicit_realign_optimized;
4729 if (!known_alignment_for_access_p (dr))
4730 is_packed = not_size_aligned (DR_REF (dr));
4732 if ((TYPE_USER_ALIGN (type) && !is_packed)
4733 || targetm.vectorize.
4734 support_vector_misalignment (mode, type,
4735 DR_MISALIGNMENT (dr), is_packed))
4736 /* Can't software pipeline the loads, but can at least do them. */
4737 return dr_unaligned_supported;
4739 else
4741 bool is_packed = false;
4742 tree type = (TREE_TYPE (DR_REF (dr)));
4744 if (!known_alignment_for_access_p (dr))
4745 is_packed = not_size_aligned (DR_REF (dr));
4747 if ((TYPE_USER_ALIGN (type) && !is_packed)
4748 || targetm.vectorize.
4749 support_vector_misalignment (mode, type,
4750 DR_MISALIGNMENT (dr), is_packed))
4751 return dr_unaligned_supported;
4754 /* Unsupported. */
4755 return dr_unaligned_unsupported;