gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2013 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "tm_p.h"
30 #include "target.h"
31 #include "basic-block.h"
32 #include "gimple-pretty-print.h"
33 #include "tree-flow.h"
34 #include "dumpfile.h"
35 #include "cfgloop.h"
36 #include "tree-chrec.h"
37 #include "tree-scalar-evolution.h"
38 #include "tree-vectorizer.h"
39 #include "diagnostic-core.h"
41 /* Need to include rtl.h, expr.h, etc. for optabs. */
42 #include "expr.h"
43 #include "optabs.h"
45 /* Return true if load- or store-lanes optab OPTAB is implemented for
46 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
48 static bool
49 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
50 tree vectype, unsigned HOST_WIDE_INT count)
52 enum machine_mode mode, array_mode;
53 bool limit_p;
55 mode = TYPE_MODE (vectype);
56 limit_p = !targetm.array_mode_supported_p (mode, count);
57 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
58 MODE_INT, limit_p);
60 if (array_mode == BLKmode)
62 if (dump_enabled_p ())
63 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
64 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]",
65 GET_MODE_NAME (mode), count);
66 return false;
69 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
71 if (dump_enabled_p ())
72 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
73 "cannot use %s<%s><%s>", name,
74 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
75 return false;
78 if (dump_enabled_p ())
79 dump_printf_loc (MSG_NOTE, vect_location,
80 "can use %s<%s><%s>", name, GET_MODE_NAME (array_mode),
81 GET_MODE_NAME (mode));
83 return true;
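/* For illustration only: with a 128-bit vector mode and COUNT == 2, the
   check above first asks for a 256-bit integer "array" mode (either a real
   MODE_INT mode or one the target explicitly allows via
   targetm.array_mode_supported_p) and then requires the load-lanes or
   store-lanes convert optab to have a handler for that (array mode,
   vector mode) pair; if either step fails, the caller cannot use the
   lanes-based scheme for the group.  */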
87 /* Return the smallest scalar part of STMT.
88 This is used to determine the vectype of the stmt. We generally set the
89 vectype according to the type of the result (lhs). For stmts whose
90 result-type is different than the type of the arguments (e.g., demotion,
91 promotion), vectype will be reset appropriately (later). Note that we have
92 to visit the smallest datatype in this function, because that determines the
93 VF. If the smallest datatype in the loop is present only as the rhs of a
94 promotion operation - we'd miss it.
95 Such a case, where a variable of this datatype does not appear in the lhs
96 anywhere in the loop, can only occur if it's an invariant: e.g.:
97 'int_x = (int) short_inv', which we'd expect to have been optimized away by
98 invariant motion. However, we cannot rely on invariant motion to always
99 take invariants out of the loop, and so in the case of promotion we also
100 have to check the rhs.
101 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
102 types. */
104 tree
105 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
106 HOST_WIDE_INT *rhs_size_unit)
108 tree scalar_type = gimple_expr_type (stmt);
109 HOST_WIDE_INT lhs, rhs;
111 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
113 if (is_gimple_assign (stmt)
114 && (gimple_assign_cast_p (stmt)
115 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
116 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
117 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
119 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
121 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
122 if (rhs < lhs)
123 scalar_type = rhs_type;
126 *lhs_size_unit = lhs;
127 *rhs_size_unit = rhs;
128 return scalar_type;
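/* For illustration only: in a loop like

     short *s; int *d;
     for (i = 0; i < n; i++)
       d[i] = (int) s[i];

   the statement's result type is 'int' but the rhs type 'short' is smaller,
   so the function above returns 'short'; as the comment explains, it is the
   smallest scalar type that determines the vectorization factor.  */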
132 /* Check if the data references pointed to by DR_I and DR_J are the same or
133 belong to the same interleaving group.  Return FALSE if the drs are
134 different, otherwise return TRUE.  */
136 static bool
137 vect_same_range_drs (data_reference_p dr_i, data_reference_p dr_j)
139 gimple stmt_i = DR_STMT (dr_i);
140 gimple stmt_j = DR_STMT (dr_j);
142 if (operand_equal_p (DR_REF (dr_i), DR_REF (dr_j), 0)
143 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
144 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j))
145 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
146 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j)))))
147 return true;
148 else
149 return false;
152 /* If address ranges represented by DDR_I and DDR_J are equal,
153 return TRUE, otherwise return FALSE. */
155 static bool
156 vect_vfa_range_equal (ddr_p ddr_i, ddr_p ddr_j)
158 if ((vect_same_range_drs (DDR_A (ddr_i), DDR_A (ddr_j))
159 && vect_same_range_drs (DDR_B (ddr_i), DDR_B (ddr_j)))
160 || (vect_same_range_drs (DDR_A (ddr_i), DDR_B (ddr_j))
161 && vect_same_range_drs (DDR_B (ddr_i), DDR_A (ddr_j))))
162 return true;
163 else
164 return false;
167 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
168 tested at run-time. Return TRUE if DDR was successfully inserted.
169 Return false if versioning is not supported. */
171 static bool
172 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
174 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
176 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
177 return false;
179 if (dump_enabled_p ())
181 dump_printf_loc (MSG_NOTE, vect_location,
182 "mark for run-time aliasing test between ");
183 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
184 dump_printf (MSG_NOTE, " and ");
185 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
188 if (optimize_loop_nest_for_size_p (loop))
190 if (dump_enabled_p ())
191 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
192 "versioning not supported when optimizing for size.");
193 return false;
196 /* FORNOW: We don't support versioning with outer-loop vectorization. */
197 if (loop->inner)
199 if (dump_enabled_p ())
200 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
201 "versioning not yet supported for outer-loops.");
202 return false;
205 /* FORNOW: We don't support creating runtime alias tests for non-constant
206 step. */
207 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
208 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
210 if (dump_enabled_p ())
211 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
212 "versioning not yet supported for non-constant "
213 "step");
214 return false;
217 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
218 return true;
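/* For illustration only: marking a DDR here means the loop will later be
   versioned on a run-time overlap test of the two accessed ranges, roughly
   of the form

     if (a + bytes_accessed_a <= b || b + bytes_accessed_b <= a)
       ... vectorized loop ...
     else
       ... scalar loop ...

   which is one reason for the constant-step restriction above: the extent
   of each accessed range must be computable before the loop.  */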
222 /* Function vect_analyze_data_ref_dependence.
224 Return TRUE if there (might) exist a dependence between a memory-reference
225 DRA and a memory-reference DRB.  If the dependence can instead be checked
226 at run-time by versioning for alias, return FALSE.  Adjust *MAX_VF according to
227 the data dependence. */
229 static bool
230 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
231 loop_vec_info loop_vinfo, int *max_vf)
233 unsigned int i;
234 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
235 struct data_reference *dra = DDR_A (ddr);
236 struct data_reference *drb = DDR_B (ddr);
237 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
238 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
239 lambda_vector dist_v;
240 unsigned int loop_depth;
242 /* In loop analysis all data references should be vectorizable. */
243 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
244 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
245 gcc_unreachable ();
247 /* Independent data accesses. */
248 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
249 return false;
251 if (dra == drb
252 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
253 return false;
255 /* Unknown data dependence. */
256 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
258 if (STMT_VINFO_GATHER_P (stmtinfo_a)
259 || STMT_VINFO_GATHER_P (stmtinfo_b))
261 if (dump_enabled_p ())
263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
264 "versioning for alias not supported for: "
265 "can't determine dependence between ");
266 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
267 DR_REF (dra));
268 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
269 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
270 DR_REF (drb));
272 return true;
275 if (dump_enabled_p ())
277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
278 "versioning for alias required: "
279 "can't determine dependence between ");
280 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
281 DR_REF (dra));
282 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
283 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
284 DR_REF (drb));
287 /* Add to list of ddrs that need to be tested at run-time. */
288 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
291 /* Known data dependence. */
292 if (DDR_NUM_DIST_VECTS (ddr) == 0)
294 if (STMT_VINFO_GATHER_P (stmtinfo_a)
295 || STMT_VINFO_GATHER_P (stmtinfo_b))
297 if (dump_enabled_p ())
299 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
300 "versioning for alias not supported for: "
301 "bad dist vector for ");
302 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
303 DR_REF (dra));
304 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
305 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
306 DR_REF (drb));
308 return true;
311 if (dump_enabled_p ())
313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
314 "versioning for alias required: "
315 "bad dist vector for ");
316 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
317 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
318 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
320 /* Add to list of ddrs that need to be tested at run-time. */
321 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
324 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
325 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
327 int dist = dist_v[loop_depth];
329 if (dump_enabled_p ())
330 dump_printf_loc (MSG_NOTE, vect_location,
331 "dependence distance = %d.", dist);
333 if (dist == 0)
335 if (dump_enabled_p ())
337 dump_printf_loc (MSG_NOTE, vect_location,
338 "dependence distance == 0 between ");
339 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
340 dump_printf (MSG_NOTE, " and ");
341 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
344 /* When we perform grouped accesses and perform implicit CSE
345 by detecting equal accesses and doing disambiguation with
346 runtime alias tests like for
347 .. = a[i];
348 .. = a[i+1];
349 a[i] = ..;
350 a[i+1] = ..;
351 *p = ..;
352 .. = a[i];
353 .. = a[i+1];
354 where we will end up loading { a[i], a[i+1] } once, make
355 sure that inserting group loads before the first load and
356 stores after the last store will do the right thing. */
357 if ((STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
358 && GROUP_SAME_DR_STMT (stmtinfo_a))
359 || (STMT_VINFO_GROUPED_ACCESS (stmtinfo_b)
360 && GROUP_SAME_DR_STMT (stmtinfo_b)))
362 gimple earlier_stmt;
363 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
364 if (DR_IS_WRITE
365 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
367 if (dump_enabled_p ())
368 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
369 "READ_WRITE dependence in interleaving.");
370 return true;
374 continue;
377 if (dist > 0 && DDR_REVERSED_P (ddr))
379 /* If DDR_REVERSED_P the order of the data-refs in DDR was
380 reversed (to make distance vector positive), and the actual
381 distance is negative. */
382 if (dump_enabled_p ())
383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
384 "dependence distance negative.");
385 continue;
388 if (abs (dist) >= 2
389 && abs (dist) < *max_vf)
391 /* The dependence distance requires reduction of the maximal
392 vectorization factor. */
393 *max_vf = abs (dist);
394 if (dump_enabled_p ())
395 dump_printf_loc (MSG_NOTE, vect_location,
396 "adjusting maximal vectorization factor to %i",
397 *max_vf);
400 if (abs (dist) >= *max_vf)
402 /* Dependence distance does not create dependence, as far as
403 vectorization is concerned, in this case. */
404 if (dump_enabled_p ())
405 dump_printf_loc (MSG_NOTE, vect_location,
406 "dependence distance >= VF.");
407 continue;
410 if (dump_enabled_p ())
412 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
413 "not vectorized, possible dependence "
414 "between data-refs ");
415 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
416 dump_printf (MSG_NOTE, " and ");
417 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
420 return true;
423 return false;
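/* For illustration only: a loop such as

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + x;

   carries a dependence with distance 3, so the code above lowers *MAX_VF
   to 3; with a larger vectorization factor the vector load of a[i + 3]
   would happen before the store to it coming from an earlier scalar
   iteration of the same vector step.  */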
426 /* Function vect_analyze_data_ref_dependences.
428 Examine all the data references in the loop, and make sure there do not
429 exist any data dependences between them. Set *MAX_VF according to
430 the maximum vectorization factor the data dependences allow. */
432 bool
433 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
435 unsigned int i;
436 struct data_dependence_relation *ddr;
438 if (dump_enabled_p ())
439 dump_printf_loc (MSG_NOTE, vect_location,
440 "=== vect_analyze_data_ref_dependences ===");
442 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
443 &LOOP_VINFO_DDRS (loop_vinfo),
444 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
445 return false;
447 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
448 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
449 return false;
451 return true;
455 /* Function vect_slp_analyze_data_ref_dependence.
457 Return TRUE if there (might) exist a dependence between a memory-reference
458 DRA and a memory-reference DRB that would prevent basic-block (SLP)
459 vectorization.  Unlike the loop variant, there is no run-time alias
460 versioning and no MAX_VF adjustment here.  */
462 static bool
463 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
465 struct data_reference *dra = DDR_A (ddr);
466 struct data_reference *drb = DDR_B (ddr);
468 /* We need to check dependences of statements marked as unvectorizable
469 as well, they still can prohibit vectorization. */
471 /* Independent data accesses. */
472 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
473 return false;
475 if (dra == drb)
476 return false;
478 /* Read-read is OK. */
479 if (DR_IS_READ (dra) && DR_IS_READ (drb))
480 return false;
482 /* If dra and drb are part of the same interleaving chain consider
483 them independent. */
484 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
485 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
486 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
487 return false;
489 /* Unknown data dependence. */
490 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
492 gimple earlier_stmt;
494 if (dump_enabled_p ())
496 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
497 "can't determine dependence between ");
498 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
499 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
500 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
503 /* We do not vectorize basic blocks with write-write dependencies. */
504 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
505 return true;
507 /* Check that it's not a load-after-store dependence. */
508 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
509 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
510 return true;
512 return false;
515 if (dump_enabled_p ())
517 dump_printf_loc (MSG_NOTE, vect_location,
518 "determined dependence between ");
519 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
520 dump_printf (MSG_NOTE, " and ");
521 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
524 /* Do not vectorize basic blocks with write-write dependences. */
525 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
526 return true;
528 /* Check dependence between DRA and DRB for basic block vectorization.
529 If the accesses share the same bases and offsets, we can compare their initial
530 constant offsets to decide whether they differ or not. In case of a read-
531 write dependence we check that the load is before the store to ensure that
532 vectorization will not change the order of the accesses. */
534 HOST_WIDE_INT type_size_a, type_size_b, init_a, init_b;
535 gimple earlier_stmt;
537 /* Check that the data-refs have same bases and offsets. If not, we can't
538 determine if they are dependent. */
539 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
540 || !dr_equal_offsets_p (dra, drb))
541 return true;
543 /* Check the types. */
544 type_size_a = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))));
545 type_size_b = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
547 if (type_size_a != type_size_b
548 || !types_compatible_p (TREE_TYPE (DR_REF (dra)),
549 TREE_TYPE (DR_REF (drb))))
550 return true;
552 init_a = TREE_INT_CST_LOW (DR_INIT (dra));
553 init_b = TREE_INT_CST_LOW (DR_INIT (drb));
555 /* Two different locations - no dependence. */
556 if (init_a != init_b)
557 return false;
559 /* We have a read-write dependence. Check that the load is before the store.
560 When we vectorize basic blocks, a vector load can only be placed before the
561 corresponding scalar load, and a vector store can only be placed after its
562 corresponding scalar store.  So the order of the accesses is preserved in
563 case the load is before the store. */
564 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
565 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
566 return false;
568 return true;
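/* For illustration only: in a straight-line block containing

     a[0] = x;
     y = a[0];

   the earlier statement is the store, so the load-after-store check above
   reports a dependence and SLP vectorization of the block is given up;
   with the statements in the opposite order the earlier statement is a
   read and the accesses are accepted, because vectorization only ever
   moves loads earlier and stores later.  */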
572 /* Function vect_slp_analyze_data_ref_dependences.
574 Examine all the data references in the basic-block, and make sure there
575 do not exist any data dependences between them.  Return FALSE if a
576 dependence that prohibits vectorization is found.  */
578 bool
579 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
581 struct data_dependence_relation *ddr;
582 unsigned int i;
584 if (dump_enabled_p ())
585 dump_printf_loc (MSG_NOTE, vect_location,
586 "=== vect_slp_analyze_data_ref_dependences ===");
588 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
589 &BB_VINFO_DDRS (bb_vinfo),
590 vNULL, true))
591 return false;
593 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
594 if (vect_slp_analyze_data_ref_dependence (ddr))
595 return false;
597 return true;
601 /* Function vect_compute_data_ref_alignment
603 Compute the misalignment of the data reference DR.
605 Output:
606 1. If during the misalignment computation it is found that the data reference
607 cannot be vectorized then false is returned.
608 2. DR_MISALIGNMENT (DR) is defined.
610 FOR NOW: No analysis is actually performed. Misalignment is calculated
611 only for trivial cases. TODO. */
613 static bool
614 vect_compute_data_ref_alignment (struct data_reference *dr)
616 gimple stmt = DR_STMT (dr);
617 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
618 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
619 struct loop *loop = NULL;
620 tree ref = DR_REF (dr);
621 tree vectype;
622 tree base, base_addr;
623 bool base_aligned;
624 tree misalign;
625 tree aligned_to, alignment;
627 if (dump_enabled_p ())
628 dump_printf_loc (MSG_NOTE, vect_location,
629 "vect_compute_data_ref_alignment:");
631 if (loop_vinfo)
632 loop = LOOP_VINFO_LOOP (loop_vinfo);
634 /* Initialize misalignment to unknown. */
635 SET_DR_MISALIGNMENT (dr, -1);
637 /* Strided loads perform only component accesses, misalignment information
638 is irrelevant for them. */
639 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
640 return true;
642 misalign = DR_INIT (dr);
643 aligned_to = DR_ALIGNED_TO (dr);
644 base_addr = DR_BASE_ADDRESS (dr);
645 vectype = STMT_VINFO_VECTYPE (stmt_info);
647 /* In case the dataref is in an inner-loop of the loop that is being
648 vectorized (LOOP), we use the base and misalignment information
649 relative to the outer-loop (LOOP). This is ok only if the misalignment
650 stays the same throughout the execution of the inner-loop, which is why
651 we have to check that the stride of the dataref in the inner-loop is a
652 multiple of the vector size.  */
653 if (loop && nested_in_vect_loop_p (loop, stmt))
655 tree step = DR_STEP (dr);
656 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
658 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
660 if (dump_enabled_p ())
661 dump_printf_loc (MSG_NOTE, vect_location,
662 "inner step divides the vector-size.");
663 misalign = STMT_VINFO_DR_INIT (stmt_info);
664 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
665 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
667 else
669 if (dump_enabled_p ())
670 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
671 "inner step doesn't divide the vector-size.");
672 misalign = NULL_TREE;
676 /* Similarly, if we're doing basic-block vectorization, we can only use
677 base and misalignment information relative to an innermost loop if the
678 misalignment stays the same throughout the execution of the loop.
679 As above, this is the case if the stride of the dataref is a multiple of
680 the vector size.  */
681 if (!loop)
683 tree step = DR_STEP (dr);
684 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
686 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
688 if (dump_enabled_p ())
689 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
690 "SLP: step doesn't divide the vector-size.");
691 misalign = NULL_TREE;
695 base = build_fold_indirect_ref (base_addr);
696 alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT);
698 if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
699 || !misalign)
701 if (dump_enabled_p ())
703 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
704 "Unknown alignment for access: ");
705 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
707 return true;
710 if ((DECL_P (base)
711 && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
712 alignment) >= 0)
713 || (TREE_CODE (base_addr) == SSA_NAME
714 && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
715 TREE_TYPE (base_addr)))),
716 alignment) >= 0)
717 || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
718 base_aligned = true;
719 else
720 base_aligned = false;
722 if (!base_aligned)
724 /* Do not change the alignment of global variables here if
725 flag_section_anchors is enabled as we already generated
726 RTL for other functions. Most global variables should
727 have been aligned during the IPA increase_alignment pass. */
728 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
729 || (TREE_STATIC (base) && flag_section_anchors))
731 if (dump_enabled_p ())
733 dump_printf_loc (MSG_NOTE, vect_location,
734 "can't force alignment of ref: ");
735 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
737 return true;
740 /* Force the alignment of the decl.
741 NOTE: This is the only change to the code we make during
742 the analysis phase, before deciding to vectorize the loop. */
743 if (dump_enabled_p ())
745 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
746 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
749 DECL_ALIGN (base) = TYPE_ALIGN (vectype);
750 DECL_USER_ALIGN (base) = 1;
753 /* At this point we assume that the base is aligned. */
754 gcc_assert (base_aligned
755 || (TREE_CODE (base) == VAR_DECL
756 && DECL_ALIGN (base) >= TYPE_ALIGN (vectype)));
758 /* If this is a backward running DR then first access in the larger
759 vectype actually is N-1 elements before the address in the DR.
760 Adjust misalign accordingly. */
761 if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
763 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
764 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
765 otherwise we wouldn't be here. */
766 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
767 /* PLUS because DR_STEP was negative. */
768 misalign = size_binop (PLUS_EXPR, misalign, offset);
771 /* Modulo alignment. */
772 misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);
774 if (!host_integerp (misalign, 1))
776 /* Negative or overflowed misalignment value. */
777 if (dump_enabled_p ())
778 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
779 "unexpected misalign value");
780 return false;
783 SET_DR_MISALIGNMENT (dr, TREE_INT_CST_LOW (misalign));
785 if (dump_enabled_p ())
787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
788 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
789 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
792 return true;
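/* For illustration only: with an aligned (or force-aligned) base, a DR_INIT
   of 4 bytes and a 16-byte vectype, the modulo above yields
   DR_MISALIGNMENT == 4; for a negative DR_STEP the misalignment is first
   adjusted by (nelements - 1) * DR_STEP, because the vector access then
   starts (nelements - 1) elements before the address in the DR.  */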
796 /* Function vect_compute_data_refs_alignment
798 Compute the misalignment of data references in the loop.
799 Return FALSE if a data reference is found that cannot be vectorized. */
801 static bool
802 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
803 bb_vec_info bb_vinfo)
805 vec<data_reference_p> datarefs;
806 struct data_reference *dr;
807 unsigned int i;
809 if (loop_vinfo)
810 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
811 else
812 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
814 FOR_EACH_VEC_ELT (datarefs, i, dr)
815 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
816 && !vect_compute_data_ref_alignment (dr))
818 if (bb_vinfo)
820 /* Mark unsupported statement as unvectorizable. */
821 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
822 continue;
824 else
825 return false;
828 return true;
832 /* Function vect_update_misalignment_for_peel
834 DR - the data reference whose misalignment is to be adjusted.
835 DR_PEEL - the data reference whose misalignment is being made
836 zero in the vector loop by the peel.
837 NPEEL - the number of iterations in the peel loop if the misalignment
838 of DR_PEEL is known at compile time. */
840 static void
841 vect_update_misalignment_for_peel (struct data_reference *dr,
842 struct data_reference *dr_peel, int npeel)
844 unsigned int i;
845 vec<dr_p> same_align_drs;
846 struct data_reference *current_dr;
847 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
848 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
849 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
850 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
852 /* For interleaved data accesses the step in the loop must be multiplied by
853 the size of the interleaving group. */
854 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
855 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
856 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
857 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
859 /* It can be assumed that the data refs with the same alignment as dr_peel
860 are aligned in the vector loop. */
861 same_align_drs
862 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
863 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
865 if (current_dr != dr)
866 continue;
867 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
868 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
869 SET_DR_MISALIGNMENT (dr, 0);
870 return;
873 if (known_alignment_for_access_p (dr)
874 && known_alignment_for_access_p (dr_peel))
876 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
877 int misal = DR_MISALIGNMENT (dr);
878 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
879 misal += negative ? -npeel * dr_size : npeel * dr_size;
880 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
881 SET_DR_MISALIGNMENT (dr, misal);
882 return;
885 if (dump_enabled_p ())
886 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.");
887 SET_DR_MISALIGNMENT (dr, -1);
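/* For illustration only: with a 16-byte vectype, 4-byte elements,
   DR_MISALIGNMENT (dr) == 12 and npeel == 1, the known-alignment case above
   computes ((12 + 1 * 4) & 15) == 0, so DR happens to become aligned by the
   same peel; with npeel == 2 it would be ((12 + 2 * 4) & 15) == 4 (assuming
   a positive step; a negative step subtracts npeel * dr_size instead).  */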
891 /* Function vect_verify_datarefs_alignment
893 Return TRUE if all data references in the loop can be
894 handled with respect to alignment. */
896 bool
897 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
899 vec<data_reference_p> datarefs;
900 struct data_reference *dr;
901 enum dr_alignment_support supportable_dr_alignment;
902 unsigned int i;
904 if (loop_vinfo)
905 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
906 else
907 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
909 FOR_EACH_VEC_ELT (datarefs, i, dr)
911 gimple stmt = DR_STMT (dr);
912 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
914 if (!STMT_VINFO_RELEVANT_P (stmt_info))
915 continue;
917 /* For interleaving, only the alignment of the first access matters.
918 Skip statements marked as not vectorizable. */
919 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
920 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
921 || !STMT_VINFO_VECTORIZABLE (stmt_info))
922 continue;
924 /* Strided loads perform only component accesses, alignment is
925 irrelevant for them. */
926 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
927 continue;
929 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
930 if (!supportable_dr_alignment)
932 if (dump_enabled_p ())
934 if (DR_IS_READ (dr))
935 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
936 "not vectorized: unsupported unaligned load.");
937 else
938 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
939 "not vectorized: unsupported unaligned "
940 "store.");
942 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
943 DR_REF (dr));
945 return false;
947 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
948 dump_printf_loc (MSG_NOTE, vect_location,
949 "Vectorizing an unaligned access.");
951 return true;
954 /* Given a memory reference EXP return whether its alignment is less
955 than its size. */
957 static bool
958 not_size_aligned (tree exp)
960 if (!host_integerp (TYPE_SIZE (TREE_TYPE (exp)), 1))
961 return true;
963 return (TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (exp)))
964 > get_object_alignment (exp));
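/* For illustration only: for a 4-byte int member of a struct declared with
   __attribute__ ((packed)), get_object_alignment typically returns 8 bits,
   which is smaller than the 32-bit TYPE_SIZE, so the function above reports
   the reference as not size-aligned.  */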
967 /* Function vector_alignment_reachable_p
969 Return true if vector alignment for DR is reachable by peeling
970 a few loop iterations. Return false otherwise. */
972 static bool
973 vector_alignment_reachable_p (struct data_reference *dr)
975 gimple stmt = DR_STMT (dr);
976 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
977 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
979 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
981 /* For interleaved access we peel only if the number of iterations in
982 the prolog loop ({VF - misalignment}) is a multiple of the
983 number of the interleaved accesses. */
984 int elem_size, mis_in_elements;
985 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
987 /* FORNOW: handle only known alignment. */
988 if (!known_alignment_for_access_p (dr))
989 return false;
991 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
992 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
994 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
995 return false;
998 /* If misalignment is known at compile time then allow peeling
999 only if natural alignment is reachable through peeling. */
1000 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1002 HOST_WIDE_INT elmsize =
1003 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1004 if (dump_enabled_p ())
1006 dump_printf_loc (MSG_NOTE, vect_location,
1007 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1008 dump_printf (MSG_NOTE,
1009 ". misalignment = %d. ", DR_MISALIGNMENT (dr));
1011 if (DR_MISALIGNMENT (dr) % elmsize)
1013 if (dump_enabled_p ())
1014 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1015 "data size does not divide the misalignment.\n");
1016 return false;
1020 if (!known_alignment_for_access_p (dr))
1022 tree type = TREE_TYPE (DR_REF (dr));
1023 bool is_packed = not_size_aligned (DR_REF (dr));
1024 if (dump_enabled_p ())
1025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1026 "Unknown misalignment, is_packed = %d",is_packed);
1027 if ((TYPE_USER_ALIGN (type) && !is_packed)
1028 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1029 return true;
1030 else
1031 return false;
1034 return true;
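/* For illustration only: with 4-byte elements and a known misalignment of
   2 bytes, the check above rejects peeling (2 % 4 != 0): peeling removes
   whole scalar iterations, each of which advances the address by a multiple
   of the element size, so no iteration count can make the access reach the
   vector alignment.  */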
1038 /* Calculate the cost of the memory access represented by DR. */
1040 static void
1041 vect_get_data_access_cost (struct data_reference *dr,
1042 unsigned int *inside_cost,
1043 unsigned int *outside_cost,
1044 stmt_vector_for_cost *body_cost_vec)
1046 gimple stmt = DR_STMT (dr);
1047 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1048 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1049 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1050 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1051 int ncopies = vf / nunits;
1053 if (DR_IS_READ (dr))
1054 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1055 NULL, body_cost_vec, false);
1056 else
1057 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1059 if (dump_enabled_p ())
1060 dump_printf_loc (MSG_NOTE, vect_location,
1061 "vect_get_data_access_cost: inside_cost = %d, "
1062 "outside_cost = %d.", *inside_cost, *outside_cost);
1066 /* Insert DR into peeling hash table with NPEEL as key. */
1068 static void
1069 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1070 int npeel)
1072 struct _vect_peel_info elem, *slot;
1073 _vect_peel_info **new_slot;
1074 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1076 elem.npeel = npeel;
1077 slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find (&elem);
1078 if (slot)
1079 slot->count++;
1080 else
1082 slot = XNEW (struct _vect_peel_info);
1083 slot->npeel = npeel;
1084 slot->dr = dr;
1085 slot->count = 1;
1086 new_slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find_slot (slot, INSERT);
1087 *new_slot = slot;
1090 if (!supportable_dr_alignment && !flag_vect_cost_model)
1091 slot->count += VECT_MAX_COST;
1095 /* Traverse the peeling hash table to find the peeling option that aligns
1096 the maximum number of data accesses.  */
1098 int
1099 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1100 _vect_peel_extended_info *max)
1102 vect_peel_info elem = *slot;
1104 if (elem->count > max->peel_info.count
1105 || (elem->count == max->peel_info.count
1106 && max->peel_info.npeel > elem->npeel))
1108 max->peel_info.npeel = elem->npeel;
1109 max->peel_info.count = elem->count;
1110 max->peel_info.dr = elem->dr;
1113 return 1;
1117 /* Traverse peeling hash table and calculate cost for each peeling option.
1118 Find the one with the lowest cost. */
1120 int
1121 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1122 _vect_peel_extended_info *min)
1124 vect_peel_info elem = *slot;
1125 int save_misalignment, dummy;
1126 unsigned int inside_cost = 0, outside_cost = 0, i;
1127 gimple stmt = DR_STMT (elem->dr);
1128 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1129 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1130 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1131 struct data_reference *dr;
1132 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1133 int single_iter_cost;
1135 prologue_cost_vec.create (2);
1136 body_cost_vec.create (2);
1137 epilogue_cost_vec.create (2);
1139 FOR_EACH_VEC_ELT (datarefs, i, dr)
1141 stmt = DR_STMT (dr);
1142 stmt_info = vinfo_for_stmt (stmt);
1143 /* For interleaving, only the alignment of the first access
1144 matters. */
1145 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1146 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1147 continue;
1149 save_misalignment = DR_MISALIGNMENT (dr);
1150 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1151 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1152 &body_cost_vec);
1153 SET_DR_MISALIGNMENT (dr, save_misalignment);
1156 single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
1157 outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel,
1158 &dummy, single_iter_cost,
1159 &prologue_cost_vec,
1160 &epilogue_cost_vec);
1162 /* Prologue and epilogue costs are added to the target model later.
1163 These costs depend only on the scalar iteration cost, the
1164 number of peeling iterations finally chosen, and the number of
1165 misaligned statements. So discard the information found here. */
1166 prologue_cost_vec.release ();
1167 epilogue_cost_vec.release ();
1169 if (inside_cost < min->inside_cost
1170 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1172 min->inside_cost = inside_cost;
1173 min->outside_cost = outside_cost;
1174 min->body_cost_vec.release ();
1175 min->body_cost_vec = body_cost_vec;
1176 min->peel_info.dr = elem->dr;
1177 min->peel_info.npeel = elem->npeel;
1179 else
1180 body_cost_vec.release ();
1182 return 1;
1186 /* Choose best peeling option by traversing peeling hash table and either
1187 choosing an option with the lowest cost (if cost model is enabled) or the
1188 option that aligns as many accesses as possible. */
1190 static struct data_reference *
1191 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1192 unsigned int *npeel,
1193 stmt_vector_for_cost *body_cost_vec)
1195 struct _vect_peel_extended_info res;
1197 res.peel_info.dr = NULL;
1198 res.body_cost_vec = stmt_vector_for_cost();
1200 if (flag_vect_cost_model)
1202 res.inside_cost = INT_MAX;
1203 res.outside_cost = INT_MAX;
1204 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1205 .traverse <_vect_peel_extended_info *,
1206 vect_peeling_hash_get_lowest_cost> (&res);
1208 else
1210 res.peel_info.count = 0;
1211 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1212 .traverse <_vect_peel_extended_info *,
1213 vect_peeling_hash_get_most_frequent> (&res);
1216 *npeel = res.peel_info.npeel;
1217 *body_cost_vec = res.body_cost_vec;
1218 return res.peel_info.dr;
1222 /* Function vect_enhance_data_refs_alignment
1224 This pass will use loop versioning and loop peeling in order to enhance
1225 the alignment of data references in the loop.
1227 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1228 original loop is to be vectorized. Any other loops that are created by
1229 the transformations performed in this pass are not supposed to be
1230 vectorized. This restriction will be relaxed.
1232 This pass will require a cost model to guide it whether to apply peeling
1233 or versioning or a combination of the two. For example, the scheme that
1234 Intel uses when given a loop with several memory accesses is as follows:
1235 choose one memory access ('p') whose alignment you want to force by doing
1236 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1237 other accesses are not necessarily aligned, or (2) use loop versioning to
1238 generate one loop in which all accesses are aligned, and another loop in
1239 which only 'p' is necessarily aligned.
1241 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1242 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1243 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1245 Devising a cost model is the most critical aspect of this work. It will
1246 guide us on which access to peel for, whether to use loop versioning, how
1247 many versions to create, etc. The cost model will probably consist of
1248 generic considerations as well as target specific considerations (on
1249 powerpc for example, misaligned stores are more painful than misaligned
1250 loads).
1252 Here are the general steps involved in alignment enhancements:
1254 -- original loop, before alignment analysis:
1255 for (i=0; i<N; i++){
1256 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1257 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1260 -- After vect_compute_data_refs_alignment:
1261 for (i=0; i<N; i++){
1262 x = q[i]; # DR_MISALIGNMENT(q) = 3
1263 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1266 -- Possibility 1: we do loop versioning:
1267 if (p is aligned) {
1268 for (i=0; i<N; i++){ # loop 1A
1269 x = q[i]; # DR_MISALIGNMENT(q) = 3
1270 p[i] = y; # DR_MISALIGNMENT(p) = 0
1273 else {
1274 for (i=0; i<N; i++){ # loop 1B
1275 x = q[i]; # DR_MISALIGNMENT(q) = 3
1276 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1280 -- Possibility 2: we do loop peeling:
1281 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1282 x = q[i];
1283 p[i] = y;
1285 for (i = 3; i < N; i++){ # loop 2A
1286 x = q[i]; # DR_MISALIGNMENT(q) = 0
1287 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1290 -- Possibility 3: combination of loop peeling and versioning:
1291 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1292 x = q[i];
1293 p[i] = y;
1295 if (p is aligned) {
1296 for (i = 3; i<N; i++){ # loop 3A
1297 x = q[i]; # DR_MISALIGNMENT(q) = 0
1298 p[i] = y; # DR_MISALIGNMENT(p) = 0
1301 else {
1302 for (i = 3; i<N; i++){ # loop 3B
1303 x = q[i]; # DR_MISALIGNMENT(q) = 0
1304 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1308 These loops are later passed to loop_transform to be vectorized. The
1309 vectorizer will use the alignment information to guide the transformation
1310 (whether to generate regular loads/stores, or with special handling for
1311 misalignment). */
1313 bool
1314 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1316 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1317 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1318 enum dr_alignment_support supportable_dr_alignment;
1319 struct data_reference *dr0 = NULL, *first_store = NULL;
1320 struct data_reference *dr;
1321 unsigned int i, j;
1322 bool do_peeling = false;
1323 bool do_versioning = false;
1324 bool stat;
1325 gimple stmt;
1326 stmt_vec_info stmt_info;
1327 unsigned int npeel = 0;
1328 bool all_misalignments_unknown = true;
1329 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1330 unsigned possible_npeel_number = 1;
1331 tree vectype;
1332 unsigned int nelements, mis, same_align_drs_max = 0;
1333 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost();
1335 if (dump_enabled_p ())
1336 dump_printf_loc (MSG_NOTE, vect_location,
1337 "=== vect_enhance_data_refs_alignment ===");
1339 /* While cost model enhancements are expected in the future, the high level
1340 view of the code at this time is as follows:
1342 A) If there is a misaligned access then see if peeling to align
1343 this access can make all data references satisfy
1344 vect_supportable_dr_alignment. If so, update data structures
1345 as needed and return true.
1347 B) If peeling wasn't possible and there is a data reference with an
1348 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1349 then see if loop versioning checks can be used to make all data
1350 references satisfy vect_supportable_dr_alignment. If so, update
1351 data structures as needed and return true.
1353 C) If neither peeling nor versioning were successful then return false if
1354 any data reference does not satisfy vect_supportable_dr_alignment.
1356 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1358 Note, Possibility 3 above (which is peeling and versioning together) is not
1359 being done at this time. */
1361 /* (1) Peeling to force alignment. */
1363 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1364 Considerations:
1365 + How many accesses will become aligned due to the peeling
1366 - How many accesses will become unaligned due to the peeling,
1367 and the cost of misaligned accesses.
1368 - The cost of peeling (the extra runtime checks, the increase
1369 in code size). */
1371 FOR_EACH_VEC_ELT (datarefs, i, dr)
1373 stmt = DR_STMT (dr);
1374 stmt_info = vinfo_for_stmt (stmt);
1376 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1377 continue;
1379 /* For interleaving, only the alignment of the first access
1380 matters. */
1381 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1382 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1383 continue;
1385 /* For invariant accesses there is nothing to enhance. */
1386 if (integer_zerop (DR_STEP (dr)))
1387 continue;
1389 /* Strided loads perform only component accesses, alignment is
1390 irrelevant for them. */
1391 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1392 continue;
1394 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1395 do_peeling = vector_alignment_reachable_p (dr);
1396 if (do_peeling)
1398 if (known_alignment_for_access_p (dr))
1400 unsigned int npeel_tmp;
1401 bool negative = tree_int_cst_compare (DR_STEP (dr),
1402 size_zero_node) < 0;
1404 /* Save info about DR in the hash table. */
1405 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
1406 LOOP_VINFO_PEELING_HTAB (loop_vinfo).create (1);
1408 vectype = STMT_VINFO_VECTYPE (stmt_info);
1409 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1410 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1411 TREE_TYPE (DR_REF (dr))));
1412 npeel_tmp = (negative
1413 ? (mis - nelements) : (nelements - mis))
1414 & (nelements - 1);
1416 /* For multiple types, it is possible that the bigger type access
1417 will have more than one peeling option. E.g., a loop with two
1418 types: one of size (vector size / 4), and the other one of
1419 size (vector size / 8). The vectorization factor will be 8. If both
1420 access are misaligned by 3, the first one needs one scalar
1421 iteration to be aligned, and the second one needs 5. But the
1422 first one will be aligned also by peeling 5 scalar
1423 iterations, and in that case both accesses will be aligned.
1424 Hence, except for the immediate peeling amount, we also want
1425 to try to add full vector size, while we don't exceed
1426 vectorization factor.
1427 We do this automatically for the cost model, since we calculate the cost
1428 for every peeling option. */
1429 if (!flag_vect_cost_model)
1430 possible_npeel_number = vf / nelements;
1432 /* Handle the aligned case. We may decide to align some other
1433 access, making DR unaligned. */
1434 if (DR_MISALIGNMENT (dr) == 0)
1436 npeel_tmp = 0;
1437 if (!flag_vect_cost_model)
1438 possible_npeel_number++;
1441 for (j = 0; j < possible_npeel_number; j++)
1443 gcc_assert (npeel_tmp <= vf);
1444 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1445 npeel_tmp += nelements;
1448 all_misalignments_unknown = false;
1449 /* Data-ref that was chosen for the case that all the
1450 misalignments are unknown is not relevant anymore, since we
1451 have a data-ref with known alignment. */
1452 dr0 = NULL;
1454 else
1456 /* If we don't know any misalignment values, we prefer
1457 peeling for the data-ref that has the maximum number of data-refs
1458 with the same alignment, unless the target prefers to align
1459 stores over loads.  */
1460 if (all_misalignments_unknown)
1462 unsigned same_align_drs
1463 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1464 if (!dr0
1465 || same_align_drs_max < same_align_drs)
1467 same_align_drs_max = same_align_drs;
1468 dr0 = dr;
1470 /* For data-refs with the same number of related
1471 accesses prefer the one where the misalign
1472 computation will be invariant in the outermost loop. */
1473 else if (same_align_drs_max == same_align_drs)
1475 struct loop *ivloop0, *ivloop;
1476 ivloop0 = outermost_invariant_loop_for_expr
1477 (loop, DR_BASE_ADDRESS (dr0));
1478 ivloop = outermost_invariant_loop_for_expr
1479 (loop, DR_BASE_ADDRESS (dr));
1480 if ((ivloop && !ivloop0)
1481 || (ivloop && ivloop0
1482 && flow_loop_nested_p (ivloop, ivloop0)))
1483 dr0 = dr;
1486 if (!first_store && DR_IS_WRITE (dr))
1487 first_store = dr;
1490 /* If there are both known and unknown misaligned accesses in the
1491 loop, we choose peeling amount according to the known
1492 accesses. */
1493 if (!supportable_dr_alignment)
1495 dr0 = dr;
1496 if (!first_store && DR_IS_WRITE (dr))
1497 first_store = dr;
1501 else
1503 if (!aligned_access_p (dr))
1505 if (dump_enabled_p ())
1506 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1507 "vector alignment may not be reachable");
1508 break;
1513 /* Check if we can possibly peel the loop. */
1514 if (!vect_can_advance_ivs_p (loop_vinfo)
1515 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1516 do_peeling = false;
1518 if (do_peeling && all_misalignments_unknown
1519 && vect_supportable_dr_alignment (dr0, false))
1522 /* Check if the target requires to prefer stores over loads, i.e., if
1523 misaligned stores are more expensive than misaligned loads (taking
1524 drs with same alignment into account). */
1525 if (first_store && DR_IS_READ (dr0))
1527 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1528 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1529 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1530 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1531 stmt_vector_for_cost dummy;
1532 dummy.create (2);
1534 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1535 &dummy);
1536 vect_get_data_access_cost (first_store, &store_inside_cost,
1537 &store_outside_cost, &dummy);
1539 dummy.release ();
1541 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1542 aligning the load DR0). */
1543 load_inside_penalty = store_inside_cost;
1544 load_outside_penalty = store_outside_cost;
1545 for (i = 0;
1546 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1547 DR_STMT (first_store))).iterate (i, &dr);
1548 i++)
1549 if (DR_IS_READ (dr))
1551 load_inside_penalty += load_inside_cost;
1552 load_outside_penalty += load_outside_cost;
1554 else
1556 load_inside_penalty += store_inside_cost;
1557 load_outside_penalty += store_outside_cost;
1560 /* Calculate the penalty for leaving DR0 unaligned (by
1561 aligning the FIRST_STORE). */
1562 store_inside_penalty = load_inside_cost;
1563 store_outside_penalty = load_outside_cost;
1564 for (i = 0;
1565 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1566 DR_STMT (dr0))).iterate (i, &dr);
1567 i++)
1568 if (DR_IS_READ (dr))
1570 store_inside_penalty += load_inside_cost;
1571 store_outside_penalty += load_outside_cost;
1573 else
1575 store_inside_penalty += store_inside_cost;
1576 store_outside_penalty += store_outside_cost;
1579 if (load_inside_penalty > store_inside_penalty
1580 || (load_inside_penalty == store_inside_penalty
1581 && load_outside_penalty > store_outside_penalty))
1582 dr0 = first_store;
1585 /* In case there are only loads with different unknown misalignments, use
1586 peeling only if it may help to align other accesses in the loop. */
1587 if (!first_store
1588 && !STMT_VINFO_SAME_ALIGN_REFS (
1589 vinfo_for_stmt (DR_STMT (dr0))).length ()
1590 && vect_supportable_dr_alignment (dr0, false)
1591 != dr_unaligned_supported)
1592 do_peeling = false;
1595 if (do_peeling && !dr0)
1597 /* Peeling is possible, but there is no data access that is not supported
1598 unless aligned. So we try to choose the best possible peeling. */
1600 /* We should get here only if there are drs with known misalignment. */
1601 gcc_assert (!all_misalignments_unknown);
1603 /* Choose the best peeling from the hash table. */
1604 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1605 &body_cost_vec);
1606 if (!dr0 || !npeel)
1607 do_peeling = false;
1610 if (do_peeling)
1612 stmt = DR_STMT (dr0);
1613 stmt_info = vinfo_for_stmt (stmt);
1614 vectype = STMT_VINFO_VECTYPE (stmt_info);
1615 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1617 if (known_alignment_for_access_p (dr0))
1619 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1620 size_zero_node) < 0;
1621 if (!npeel)
1623 /* Since it's known at compile time, compute the number of
1624 iterations in the peeled loop (the peeling factor) for use in
1625 updating DR_MISALIGNMENT values. The peeling factor is the
1626 vectorization factor minus the misalignment as an element
1627 count. */
1628 mis = DR_MISALIGNMENT (dr0);
1629 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1630 npeel = ((negative ? mis - nelements : nelements - mis)
1631 & (nelements - 1));
1634 /* For interleaved data access every iteration accesses all the
1635 members of the group, therefore we divide the number of iterations
1636 by the group size. */
1637 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1638 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1639 npeel /= GROUP_SIZE (stmt_info);
1641 if (dump_enabled_p ())
1642 dump_printf_loc (MSG_NOTE, vect_location,
1643 "Try peeling by %d", npeel);
1646 /* Ensure that all data refs can be vectorized after the peel. */
1647 FOR_EACH_VEC_ELT (datarefs, i, dr)
1649 int save_misalignment;
1651 if (dr == dr0)
1652 continue;
1654 stmt = DR_STMT (dr);
1655 stmt_info = vinfo_for_stmt (stmt);
1656 /* For interleaving, only the alignment of the first access
1657 matters. */
1658 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1659 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1660 continue;
1662 /* Strided loads perform only component accesses, alignment is
1663 irrelevant for them. */
1664 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1665 continue;
1667 save_misalignment = DR_MISALIGNMENT (dr);
1668 vect_update_misalignment_for_peel (dr, dr0, npeel);
1669 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1670 SET_DR_MISALIGNMENT (dr, save_misalignment);
1672 if (!supportable_dr_alignment)
1674 do_peeling = false;
1675 break;
1679 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1681 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1682 if (!stat)
1683 do_peeling = false;
1684 else
1686 body_cost_vec.release ();
1687 return stat;
1691 if (do_peeling)
1693 stmt_info_for_cost *si;
1694 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
1696 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1697 If the misalignment of DR_i is identical to that of dr0 then set
1698 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1699 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1700 by the peeling factor times the element size of DR_i (MOD the
1701 vectorization factor times the size). Otherwise, the
1702 misalignment of DR_i must be set to unknown. */
1703 FOR_EACH_VEC_ELT (datarefs, i, dr)
1704 if (dr != dr0)
1705 vect_update_misalignment_for_peel (dr, dr0, npeel);
1707 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1708 if (npeel)
1709 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1710 else
1711 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = DR_MISALIGNMENT (dr0);
1712 SET_DR_MISALIGNMENT (dr0, 0);
1713 if (dump_enabled_p ())
1715 dump_printf_loc (MSG_NOTE, vect_location,
1716 "Alignment of access forced using peeling.");
1717 dump_printf_loc (MSG_NOTE, vect_location,
1718 "Peeling for alignment will be applied.");
1720 /* We've delayed passing the inside-loop peeling costs to the
1721 target cost model until we were sure peeling would happen.
1722 Do so now. */
1723 if (body_cost_vec.exists ())
1725 FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1727 struct _stmt_vec_info *stmt_info
1728 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1729 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1730 si->misalign, vect_body);
1732 body_cost_vec.release ();
1735 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1736 gcc_assert (stat);
1737 return stat;
1741 body_cost_vec.release ();
1743 /* (2) Versioning to force alignment. */
1745 /* Try versioning if:
1746 1) flag_tree_vect_loop_version is TRUE
1747 2) optimize loop for speed
1748 3) there is at least one unsupported misaligned data ref with an unknown
1749 misalignment, and
1750 4) all misaligned data refs with a known misalignment are supported, and
1751 5) the number of runtime alignment checks is within reason. */
1753 do_versioning =
1754 flag_tree_vect_loop_version
1755 && optimize_loop_nest_for_speed_p (loop)
1756 && (!loop->inner); /* FORNOW */
1758 if (do_versioning)
1760 FOR_EACH_VEC_ELT (datarefs, i, dr)
1762 stmt = DR_STMT (dr);
1763 stmt_info = vinfo_for_stmt (stmt);
1765 /* For interleaving, only the alignment of the first access
1766 matters. */
1767 if (aligned_access_p (dr)
1768 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1769 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1770 continue;
1772 /* Strided loads perform only component accesses, alignment is
1773 irrelevant for them. */
1774 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1775 continue;
1777 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1779 if (!supportable_dr_alignment)
1781 gimple stmt;
1782 int mask;
1783 tree vectype;
1785 if (known_alignment_for_access_p (dr)
1786 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1787 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1789 do_versioning = false;
1790 break;
1793 stmt = DR_STMT (dr);
1794 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1795 gcc_assert (vectype);
1797 /* The rightmost bits of an aligned address must be zeros.
1798 Construct the mask needed for this test. For example,
1799 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1800 mask must be 15 = 0xf. */
1801 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1803 /* FORNOW: use the same mask to test all potentially unaligned
1804 references in the loop. The vectorizer currently supports
1805 a single vector size, see the reference to
1806 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1807 vectorization factor is computed. */
1808 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1809 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1810 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1811 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1812 DR_STMT (dr));
1816 /* Versioning requires at least one misaligned data reference. */
1817 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1818 do_versioning = false;
1819 else if (!do_versioning)
1820 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1823 if (do_versioning)
1825 vec<gimple> may_misalign_stmts
1826 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1827 gimple stmt;
1829 /* It can now be assumed that the data references in the statements
1830 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1831 of the loop being vectorized. */
1832 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1834 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1835 dr = STMT_VINFO_DATA_REF (stmt_info);
1836 SET_DR_MISALIGNMENT (dr, 0);
1837 if (dump_enabled_p ())
1838 dump_printf_loc (MSG_NOTE, vect_location,
1839 "Alignment of access forced using versioning.");
1842 if (dump_enabled_p ())
1843 dump_printf_loc (MSG_NOTE, vect_location,
1844 "Versioning for alignment will be applied.");
1846 /* Peeling and versioning can't be done together at this time. */
1847 gcc_assert (! (do_peeling && do_versioning));
1849 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1850 gcc_assert (stat);
1851 return stat;
1854 /* This point is reached if neither peeling nor versioning is being done. */
1855 gcc_assert (! (do_peeling || do_versioning));
1857 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1858 return stat;
1862 /* Function vect_find_same_alignment_drs.
1864 Update group and alignment relations according to the chosen
1865 vectorization factor. */
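   /* For example (hypothetical loop, for illustration only): with 4-byte
      elements and a vectorization factor of 4, the two references to A in

        for (i = 0; i < n; i++)
          a[i + 4] = a[i] + 1;

      have dependence distance 4, which is a multiple of the VF, so they are
      recorded below as having the same alignment.  */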
1867 static void
1868 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1869 loop_vec_info loop_vinfo)
1871 unsigned int i;
1872 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1873 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1874 struct data_reference *dra = DDR_A (ddr);
1875 struct data_reference *drb = DDR_B (ddr);
1876 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1877 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1878 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1879 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1880 lambda_vector dist_v;
1881 unsigned int loop_depth;
1883 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1884 return;
1886 if (dra == drb)
1887 return;
1889 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1890 return;
1892 /* Loop-based vectorization and known data dependence. */
1893 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1894 return;
1896 /* Data-dependence analysis reports a distance vector of zero
1897 for data-references that overlap only in the first iteration
1898 but whose steps have different signs (see PR45764).
1899 So as a sanity check require equal DR_STEP. */
1900 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1901 return;
1903 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1904 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1906 int dist = dist_v[loop_depth];
1908 if (dump_enabled_p ())
1909 dump_printf_loc (MSG_NOTE, vect_location,
1910 "dependence distance = %d.", dist);
1912 /* Same loop iteration. */
1913 if (dist == 0
1914 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1916 /* Two references with distance zero have the same alignment. */
1917 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1918 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1919 if (dump_enabled_p ())
1921 dump_printf_loc (MSG_NOTE, vect_location,
1922 "accesses have the same alignment.");
1923 dump_printf (MSG_NOTE,
1924 "dependence distance modulo vf == 0 between ");
1925 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1926 dump_printf (MSG_NOTE, " and ");
1927 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1934 /* Function vect_analyze_data_refs_alignment
1936 Analyze the alignment of the data-references in the loop.
1937 Return FALSE if a data reference is found that cannot be vectorized. */
1939 bool
1940 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1941 bb_vec_info bb_vinfo)
1943 if (dump_enabled_p ())
1944 dump_printf_loc (MSG_NOTE, vect_location,
1945 "=== vect_analyze_data_refs_alignment ===");
1947 /* Mark groups of data references with same alignment using
1948 data dependence information. */
1949 if (loop_vinfo)
1951 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
1952 struct data_dependence_relation *ddr;
1953 unsigned int i;
1955 FOR_EACH_VEC_ELT (ddrs, i, ddr)
1956 vect_find_same_alignment_drs (ddr, loop_vinfo);
1959 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
1961 if (dump_enabled_p ())
1962 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1963 "not vectorized: can't calculate alignment "
1964 "for data ref.");
1965 return false;
1968 return true;
1972 /* Analyze groups of accesses: check that DR belongs to a group of
1973 accesses of legal size, step, etc. Detect gaps, single element
1974 interleaving, and other special cases. Set grouped access info.
1975 Collect groups of strided stores for further use in SLP analysis. */
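   /* A hypothetical example (illustrative only, assuming 4-byte elements):

        for (i = 0; i < n; i++)
          {
            s1 += in[3 * i];
            s2 += in[3 * i + 2];
          }

      Here the two loads form an interleaving group with DR_STEP 12 and
      GROUPSIZE 3; the element at 3 * i + 1 is a gap.  Such gaps are tolerated
      for loads (although they rule out SLP), whereas an interleaved store
      with gaps is rejected below.  */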
1977 static bool
1978 vect_analyze_group_access (struct data_reference *dr)
1980 tree step = DR_STEP (dr);
1981 tree scalar_type = TREE_TYPE (DR_REF (dr));
1982 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
1983 gimple stmt = DR_STMT (dr);
1984 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1985 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1986 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
1987 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
1988 HOST_WIDE_INT groupsize, last_accessed_element = 1;
1989 bool slp_impossible = false;
1990 struct loop *loop = NULL;
1992 if (loop_vinfo)
1993 loop = LOOP_VINFO_LOOP (loop_vinfo);
1995 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
1996 size of the interleaving group (including gaps). */
1997 groupsize = absu_hwi (dr_step) / type_size;
1999 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2000 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2002 /* Check if this DR is a part of interleaving, and is a single
2003 element of the group that is accessed in the loop. */
2005 /* Gaps are supported only for loads. STEP must be a multiple of the type
2006 size. The size of the group must be a power of 2. */
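      /* For instance (illustrative only): a lone load of in[4 * i] with 4-byte
         elements has DR_STEP 16, so GROUPSIZE is 4 and the load is treated as
         a group of size 4 whose other three members are gaps; as noted below,
         such an access then requires a scalar epilogue loop.  */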
2007 if (DR_IS_READ (dr)
2008 && (dr_step % type_size) == 0
2009 && groupsize > 0
2010 && exact_log2 (groupsize) != -1)
2012 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2013 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2014 if (dump_enabled_p ())
2016 dump_printf_loc (MSG_NOTE, vect_location,
2017 "Detected single element interleaving ");
2018 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2019 dump_printf (MSG_NOTE, " step ");
2020 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2023 if (loop_vinfo)
2025 if (dump_enabled_p ())
2026 dump_printf_loc (MSG_NOTE, vect_location,
2027 "Data access with gaps requires scalar "
2028 "epilogue loop");
2029 if (loop->inner)
2031 if (dump_enabled_p ())
2032 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2033 "Peeling for outer loop is not"
2034 " supported");
2035 return false;
2038 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2041 return true;
2044 if (dump_enabled_p ())
2046 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2047 "not consecutive access ");
2048 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2051 if (bb_vinfo)
2053 /* Mark the statement as unvectorizable. */
2054 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2055 return true;
2058 return false;
2061 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2063 /* First stmt in the interleaving chain. Check the chain. */
2064 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2065 struct data_reference *data_ref = dr;
2066 unsigned int count = 1;
2067 tree prev_init = DR_INIT (data_ref);
2068 gimple prev = stmt;
2069 HOST_WIDE_INT diff, gaps = 0;
2070 unsigned HOST_WIDE_INT count_in_bytes;
2072 while (next)
2074 /* Skip same data-refs. In case that two or more stmts share
2075 data-ref (supported only for loads), we vectorize only the first
2076 stmt, and the rest get their vectorized loads from the first
2077 one. */
2078 if (!tree_int_cst_compare (DR_INIT (data_ref),
2079 DR_INIT (STMT_VINFO_DATA_REF (
2080 vinfo_for_stmt (next)))))
2082 if (DR_IS_WRITE (data_ref))
2084 if (dump_enabled_p ())
2085 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2086 "Two store stmts share the same dr.");
2087 return false;
2090 /* For load use the same data-ref load. */
2091 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2093 prev = next;
2094 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2095 continue;
2098 prev = next;
2099 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2101 /* All group members have the same STEP by construction. */
2102 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2104 /* Check that the distance between two accesses is equal to the type
2105 size. Otherwise, we have gaps. */
2106 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2107 - TREE_INT_CST_LOW (prev_init)) / type_size;
2108 if (diff != 1)
2110 /* FORNOW: SLP of accesses with gaps is not supported. */
2111 slp_impossible = true;
2112 if (DR_IS_WRITE (data_ref))
2114 if (dump_enabled_p ())
2115 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2116 "interleaved store with gaps");
2117 return false;
2120 gaps += diff - 1;
2123 last_accessed_element += diff;
2125 /* Store the gap from the previous member of the group. If there is no
2126 gap in the access, GROUP_GAP is always 1. */
2127 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2129 prev_init = DR_INIT (data_ref);
2130 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2131 /* Count the number of data-refs in the chain. */
2132 count++;
2135 /* COUNT is the number of accesses found; we multiply it by the size of
2136 the type to get COUNT_IN_BYTES. */
2137 count_in_bytes = type_size * count;
2139 /* Check that the size of the interleaving (including gaps) is not
2140 greater than STEP. */
2141 if (dr_step != 0
2142 && absu_hwi (dr_step) < count_in_bytes + gaps * type_size)
2144 if (dump_enabled_p ())
2146 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2147 "interleaving size is greater than step for ");
2148 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dr));
2150 return false;
2153 /* Check that the size of the interleaving is equal to STEP for stores,
2154 i.e., that there are no gaps. */
2155 if (dr_step != 0
2156 && absu_hwi (dr_step) != count_in_bytes)
2158 if (DR_IS_READ (dr))
2160 slp_impossible = true;
2161 /* There is a gap after the last load in the group. This gap is a
2162 difference between the groupsize and the number of elements.
2163 When there is no gap, this difference should be 0. */
2164 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
2166 else
2168 if (dump_enabled_p ())
2169 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2170 "interleaved store with gaps");
2171 return false;
2175 /* Check that STEP is a multiple of type size. */
2176 if (dr_step != 0
2177 && (dr_step % type_size) != 0)
2179 if (dump_enabled_p ())
2181 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2182 "step is not a multiple of type size: step ");
2183 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2184 dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2185 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2186 TYPE_SIZE_UNIT (scalar_type));
2188 return false;
2191 if (groupsize == 0)
2192 groupsize = count;
2194 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2195 if (dump_enabled_p ())
2196 dump_printf_loc (MSG_NOTE, vect_location,
2197 "Detected interleaving of size %d", (int)groupsize);
2199 /* SLP: create an SLP data structure for every interleaving group of
2200 stores for further analysis in vect_analyze_slp.
2201 if (DR_IS_WRITE (dr) && !slp_impossible)
2203 if (loop_vinfo)
2204 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2205 if (bb_vinfo)
2206 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2209 /* There is a gap in the end of the group. */
2210 if (groupsize - last_accessed_element > 0 && loop_vinfo)
2212 if (dump_enabled_p ())
2213 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2214 "Data access with gaps requires scalar "
2215 "epilogue loop");
2216 if (loop->inner)
2218 if (dump_enabled_p ())
2219 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2220 "Peeling for outer loop is not supported");
2221 return false;
2224 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2228 return true;
2232 /* Analyze the access pattern of the data-reference DR.
2233 In case of non-consecutive accesses call vect_analyze_group_access() to
2234 analyze groups of accesses. */
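   /* Roughly, and purely as an illustration of the cases handled below
      (assuming a 4-byte element type):

        a[i]        step == element size  -> consecutive access
        invariant   step == 0             -> allowed for loads only
        a[2 * i]    constant step != size -> candidate for an interleaving group
        a[i * k]    non-constant step     -> only if already classified as a
                                             strided load  */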
2236 static bool
2237 vect_analyze_data_ref_access (struct data_reference *dr)
2239 tree step = DR_STEP (dr);
2240 tree scalar_type = TREE_TYPE (DR_REF (dr));
2241 gimple stmt = DR_STMT (dr);
2242 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2243 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2244 struct loop *loop = NULL;
2246 if (loop_vinfo)
2247 loop = LOOP_VINFO_LOOP (loop_vinfo);
2249 if (loop_vinfo && !step)
2251 if (dump_enabled_p ())
2252 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2253 "bad data-ref access in loop");
2254 return false;
2257 /* Allow invariant loads in loops. */
2258 if (loop_vinfo && integer_zerop (step))
2260 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2261 return DR_IS_READ (dr);
2264 if (loop && nested_in_vect_loop_p (loop, stmt))
2266 /* Interleaved accesses are not yet supported within outer-loop
2267 vectorization for references in the inner-loop. */
2268 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2270 /* For the rest of the analysis we use the outer-loop step. */
2271 step = STMT_VINFO_DR_STEP (stmt_info);
2272 if (integer_zerop (step))
2274 if (dump_enabled_p ())
2275 dump_printf_loc (MSG_NOTE, vect_location,
2276 "zero step in outer loop.");
2277 if (DR_IS_READ (dr))
2278 return true;
2279 else
2280 return false;
2284 /* Consecutive? */
2285 if (TREE_CODE (step) == INTEGER_CST)
2287 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2288 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2289 || (dr_step < 0
2290 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2292 /* Mark that it is not interleaving. */
2293 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2294 return true;
2298 if (loop && nested_in_vect_loop_p (loop, stmt))
2300 if (dump_enabled_p ())
2301 dump_printf_loc (MSG_NOTE, vect_location,
2302 "grouped access in outer loop.");
2303 return false;
2306 /* Assume this is a DR handled by the non-constant strided load case. */
2307 if (TREE_CODE (step) != INTEGER_CST)
2308 return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2310 /* Not a consecutive access - check if it is part of an interleaving group. */
2311 return vect_analyze_group_access (dr);
2314 /* Compare two data-references DRA and DRB to group them into chunks
2315 that are suitable for forming interleaving groups. */
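   /* The sort key is, in order: base address, DR_OFFSET, read vs. write,
      access size, DR_STEP, and finally DR_INIT (with the stmt UID as a
      tiebreaker), so that, for example, the accesses a[2 * i] and
      a[2 * i + 1] always end up adjacent and can be recognized as one
      interleaving group by the caller.  */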
2317 static int
2318 dr_group_sort_cmp (const void *dra_, const void *drb_)
2320 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2321 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2322 hashval_t h1, h2;
2323 int cmp;
2325 /* Stabilize sort. */
2326 if (dra == drb)
2327 return 0;
2329 /* Ordering of DRs according to base. */
2330 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2332 h1 = iterative_hash_expr (DR_BASE_ADDRESS (dra), 0);
2333 h2 = iterative_hash_expr (DR_BASE_ADDRESS (drb), 0);
2334 if (h1 != h2)
2335 return h1 < h2 ? -1 : 1;
2338 /* And according to DR_OFFSET. */
2339 if (!dr_equal_offsets_p (dra, drb))
2341 h1 = iterative_hash_expr (DR_OFFSET (dra), 0);
2342 h2 = iterative_hash_expr (DR_OFFSET (drb), 0);
2343 if (h1 != h2)
2344 return h1 < h2 ? -1 : 1;
2347 /* Put reads before writes. */
2348 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2349 return DR_IS_READ (dra) ? -1 : 1;
2351 /* Then sort by access size. */
2352 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2353 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2355 h1 = iterative_hash_expr (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))), 0);
2356 h2 = iterative_hash_expr (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0);
2357 if (h1 != h2)
2358 return h1 < h2 ? -1 : 1;
2361 /* And by step. */
2362 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2364 h1 = iterative_hash_expr (DR_STEP (dra), 0);
2365 h2 = iterative_hash_expr (DR_STEP (drb), 0);
2366 if (h1 != h2)
2367 return h1 < h2 ? -1 : 1;
2370 /* Then sort by DR_INIT. In case of identical DRs sort by the stmt UID. */
2371 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2372 if (cmp == 0)
2373 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2374 return cmp;
2377 /* Function vect_analyze_data_ref_accesses.
2379 Analyze the access pattern of all the data references in the loop.
2381 FORNOW: the only access pattern that is considered vectorizable is a
2382 simple step 1 (consecutive) access.
2384 FORNOW: handle only arrays and pointer accesses. */
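 /* For instance (hypothetical, assuming 4-byte elements): after sorting, the
    stores p[2 * i] and p[2 * i + 1] appear adjacent with DR_INIT 0 and 4 and
    equal DR_STEP 8, so (4 - 0) % 4 == 0 and 8 > 4 - 0 below, and the two are
    chained into a single interleaving group rooted at the first store.  */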
2386 bool
2387 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2389 unsigned int i;
2390 vec<data_reference_p> datarefs;
2391 struct data_reference *dr;
2393 if (dump_enabled_p ())
2394 dump_printf_loc (MSG_NOTE, vect_location,
2395 "=== vect_analyze_data_ref_accesses ===");
2397 if (loop_vinfo)
2398 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2399 else
2400 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2402 if (datarefs.is_empty ())
2403 return true;
2405 /* Sort the array of datarefs to make building the interleaving chains
2406 linear. */
2407 qsort (datarefs.address(), datarefs.length (),
2408 sizeof (data_reference_p), dr_group_sort_cmp);
2410 /* Build the interleaving chains. */
2411 for (i = 0; i < datarefs.length () - 1;)
2413 data_reference_p dra = datarefs[i];
2414 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2415 stmt_vec_info lastinfo = NULL;
2416 for (i = i + 1; i < datarefs.length (); ++i)
2418 data_reference_p drb = datarefs[i];
2419 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2421 /* ??? Imperfect sorting (non-compatible types, non-modulo
2422 accesses, same accesses) can lead to a group being artificially
2423 split here, as we don't just skip over those. If it really
2424 matters we can push those to a worklist and re-iterate
2425 over them. Then we can just skip ahead to the next DR here. */
2427 /* Check that the data-refs have same first location (except init)
2428 and they are both either store or load (not load and store). */
2429 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2430 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2431 DR_BASE_ADDRESS (drb), 0)
2432 || !dr_equal_offsets_p (dra, drb))
2433 break;
2435 /* Check that the data-refs have the same constant size and step. */
2436 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2437 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2438 if (!host_integerp (sza, 1)
2439 || !host_integerp (szb, 1)
2440 || !tree_int_cst_equal (sza, szb)
2441 || !host_integerp (DR_STEP (dra), 0)
2442 || !host_integerp (DR_STEP (drb), 0)
2443 || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2444 break;
2446 /* Do not place the same access in the interleaving chain twice. */
2447 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2448 break;
2450 /* Check the types are compatible.
2451 ??? We don't distinguish this during sorting. */
2452 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2453 TREE_TYPE (DR_REF (drb))))
2454 break;
2456 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2457 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2458 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2459 gcc_assert (init_a < init_b);
2461 /* If init_b == init_a + the size of the type * k, we have an
2462 interleaving, and DRA is accessed before DRB. */
2463 HOST_WIDE_INT type_size_a = TREE_INT_CST_LOW (sza);
2464 if ((init_b - init_a) % type_size_a != 0)
2465 break;
2467 /* The step (if not zero) is greater than the difference between
2468 data-refs' inits. This splits groups into suitable sizes. */
2469 HOST_WIDE_INT step = TREE_INT_CST_LOW (DR_STEP (dra));
2470 if (step != 0 && step <= (init_b - init_a))
2471 break;
2473 if (dump_enabled_p ())
2475 dump_printf_loc (MSG_NOTE, vect_location,
2476 "Detected interleaving ");
2477 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2478 dump_printf (MSG_NOTE, " and ");
2479 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2482 /* Link the found element into the group list. */
2483 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2485 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2486 lastinfo = stmtinfo_a;
2488 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2489 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2490 lastinfo = stmtinfo_b;
2494 FOR_EACH_VEC_ELT (datarefs, i, dr)
2495 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2496 && !vect_analyze_data_ref_access (dr))
2498 if (dump_enabled_p ())
2499 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2500 "not vectorized: complicated access pattern.");
2502 if (bb_vinfo)
2504 /* Mark the statement as not vectorizable. */
2505 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2506 continue;
2508 else
2509 return false;
2512 return true;
2515 /* Function vect_prune_runtime_alias_test_list.
2517 Prune a list of ddrs to be tested at run-time by versioning for alias.
2518 Return FALSE if resulting list of ddrs is longer than allowed by
2519 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
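 /* As background only (the checks themselves are generated later, during loop
    versioning): each surviving ddr becomes a runtime test of roughly the form

      if (addr_a + segment_length_a <= addr_b
          || addr_b + segment_length_b <= addr_a)
        ... vectorized loop ...

    so pruning duplicate ranges here keeps the number of such tests within
    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS.  */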
2521 bool
2522 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2524 vec<ddr_p> ddrs =
2525 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2526 unsigned i, j;
2528 if (dump_enabled_p ())
2529 dump_printf_loc (MSG_NOTE, vect_location,
2530 "=== vect_prune_runtime_alias_test_list ===");
2532 for (i = 0; i < ddrs.length (); )
2534 bool found;
2535 ddr_p ddr_i;
2537 ddr_i = ddrs[i];
2538 found = false;
2540 for (j = 0; j < i; j++)
2542 ddr_p ddr_j = ddrs[j];
2544 if (vect_vfa_range_equal (ddr_i, ddr_j))
2546 if (dump_enabled_p ())
2548 dump_printf_loc (MSG_NOTE, vect_location,
2549 "found equal ranges ");
2550 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr_i)));
2551 dump_printf (MSG_NOTE, ", ");
2552 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr_i)));
2553 dump_printf (MSG_NOTE, " and ");
2554 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr_j)));
2555 dump_printf (MSG_NOTE, ", ");
2556 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr_j)));
2558 found = true;
2559 break;
2563 if (found)
2565 ddrs.ordered_remove (i);
2566 continue;
2568 i++;
2571 if (ddrs.length () >
2572 (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2574 if (dump_enabled_p ())
2576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2577 "disable versioning for alias - max number of "
2578 "generated checks exceeded.");
2581 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0);
2583 return false;
2586 return true;
2589 /* Check whether a non-affine read in stmt is suitable for gather load
2590 and if so, return a builtin decl for that operation. */
2592 tree
2593 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2594 tree *offp, int *scalep)
2596 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2597 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2598 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2599 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2600 tree offtype = NULL_TREE;
2601 tree decl, base, off;
2602 enum machine_mode pmode;
2603 int punsignedp, pvolatilep;
2605 /* The gather builtins need address of the form
2606 loop_invariant + vector * {1, 2, 4, 8}
2607 or
2608 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2609 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
2610 of loop invariants/SSA_NAMEs defined in the loop, with casts,
2611 multiplications and additions in it. To get a vector, we need
2612 a single SSA_NAME that will be defined in the loop and will
2613 contain everything that is not loop invariant and that can be
2614 vectorized. The following code attempts to find such a preexisting
2615 SSA_NAME OFF and put the loop invariants into a tree BASE
2616 that can be gimplified before the loop. */
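   /* Purely as an illustration (hypothetical loop): for a gather such as

        for (i = 0; i < n; i++)
          sum += a[idx[i]];

      the reference a[idx[i]] would be decomposed into BASE = &a, OFF = the
      SSA_NAME holding idx[i] (possibly after sign extension), and SCALE = the
      element size of A, matching the address form above.  */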
2617 base = get_inner_reference (DR_REF (dr), &pbitsize, &pbitpos, &off,
2618 &pmode, &punsignedp, &pvolatilep, false);
2619 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
2621 if (TREE_CODE (base) == MEM_REF)
2623 if (!integer_zerop (TREE_OPERAND (base, 1)))
2625 if (off == NULL_TREE)
2627 double_int moff = mem_ref_offset (base);
2628 off = double_int_to_tree (sizetype, moff);
2630 else
2631 off = size_binop (PLUS_EXPR, off,
2632 fold_convert (sizetype, TREE_OPERAND (base, 1)));
2634 base = TREE_OPERAND (base, 0);
2636 else
2637 base = build_fold_addr_expr (base);
2639 if (off == NULL_TREE)
2640 off = size_zero_node;
2642 /* If BASE is not loop invariant, then either OFF is 0 and we start with
2643 just the constant offset in the loop invariant BASE, continuing with
2644 BASE as OFF, or else we give up.
2645 We could handle that case by gimplifying the addition of base + off
2646 into some SSA_NAME and use that as off, but for now punt. */
2647 if (!expr_invariant_in_loop_p (loop, base))
2649 if (!integer_zerop (off))
2650 return NULL_TREE;
2651 off = base;
2652 base = size_int (pbitpos / BITS_PER_UNIT);
2654 /* Otherwise put base + constant offset into the loop invariant BASE
2655 and continue with OFF. */
2656 else
2658 base = fold_convert (sizetype, base);
2659 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
2662 /* OFF at this point may be either a SSA_NAME or some tree expression
2663 from get_inner_reference. Try to peel off loop invariants from it
2664 into BASE as long as possible. */
2665 STRIP_NOPS (off);
2666 while (offtype == NULL_TREE)
2668 enum tree_code code;
2669 tree op0, op1, add = NULL_TREE;
2671 if (TREE_CODE (off) == SSA_NAME)
2673 gimple def_stmt = SSA_NAME_DEF_STMT (off);
2675 if (expr_invariant_in_loop_p (loop, off))
2676 return NULL_TREE;
2678 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
2679 break;
2681 op0 = gimple_assign_rhs1 (def_stmt);
2682 code = gimple_assign_rhs_code (def_stmt);
2683 op1 = gimple_assign_rhs2 (def_stmt);
2685 else
2687 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
2688 return NULL_TREE;
2689 code = TREE_CODE (off);
2690 extract_ops_from_tree (off, &code, &op0, &op1);
2692 switch (code)
2694 case POINTER_PLUS_EXPR:
2695 case PLUS_EXPR:
2696 if (expr_invariant_in_loop_p (loop, op0))
2698 add = op0;
2699 off = op1;
2700 do_add:
2701 add = fold_convert (sizetype, add);
2702 if (scale != 1)
2703 add = size_binop (MULT_EXPR, add, size_int (scale));
2704 base = size_binop (PLUS_EXPR, base, add);
2705 continue;
2707 if (expr_invariant_in_loop_p (loop, op1))
2709 add = op1;
2710 off = op0;
2711 goto do_add;
2713 break;
2714 case MINUS_EXPR:
2715 if (expr_invariant_in_loop_p (loop, op1))
2717 add = fold_convert (sizetype, op1);
2718 add = size_binop (MINUS_EXPR, size_zero_node, add);
2719 off = op0;
2720 goto do_add;
2722 break;
2723 case MULT_EXPR:
2724 if (scale == 1 && host_integerp (op1, 0))
2726 scale = tree_low_cst (op1, 0);
2727 off = op0;
2728 continue;
2730 break;
2731 case SSA_NAME:
2732 off = op0;
2733 continue;
2734 CASE_CONVERT:
2735 if (!POINTER_TYPE_P (TREE_TYPE (op0))
2736 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
2737 break;
2738 if (TYPE_PRECISION (TREE_TYPE (op0))
2739 == TYPE_PRECISION (TREE_TYPE (off)))
2741 off = op0;
2742 continue;
2744 if (TYPE_PRECISION (TREE_TYPE (op0))
2745 < TYPE_PRECISION (TREE_TYPE (off)))
2747 off = op0;
2748 offtype = TREE_TYPE (off);
2749 STRIP_NOPS (off);
2750 continue;
2752 break;
2753 default:
2754 break;
2756 break;
2759 /* If at the end OFF still isn't a SSA_NAME or isn't
2760 defined in the loop, punt. */
2761 if (TREE_CODE (off) != SSA_NAME
2762 || expr_invariant_in_loop_p (loop, off))
2763 return NULL_TREE;
2765 if (offtype == NULL_TREE)
2766 offtype = TREE_TYPE (off);
2768 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
2769 offtype, scale);
2770 if (decl == NULL_TREE)
2771 return NULL_TREE;
2773 if (basep)
2774 *basep = base;
2775 if (offp)
2776 *offp = off;
2777 if (scalep)
2778 *scalep = scale;
2779 return decl;
2782 /* Function vect_analyze_data_refs.
2784 Find all the data references in the loop or basic block.
2786 The general structure of the analysis of data refs in the vectorizer is as
2787 follows:
2788 1- vect_analyze_data_refs(loop/bb): call
2789 compute_data_dependences_for_loop/bb to find and analyze all data-refs
2790 in the loop/bb and their dependences.
2791 2- vect_analyze_dependences(): apply dependence testing using ddrs.
2792 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
2793 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
2797 bool
2798 vect_analyze_data_refs (loop_vec_info loop_vinfo,
2799 bb_vec_info bb_vinfo,
2800 int *min_vf)
2802 struct loop *loop = NULL;
2803 basic_block bb = NULL;
2804 unsigned int i;
2805 vec<data_reference_p> datarefs;
2806 struct data_reference *dr;
2807 tree scalar_type;
2809 if (dump_enabled_p ())
2810 dump_printf_loc (MSG_NOTE, vect_location,
2811 "=== vect_analyze_data_refs ===\n");
2813 if (loop_vinfo)
2815 loop = LOOP_VINFO_LOOP (loop_vinfo);
2816 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo))
2817 || find_data_references_in_loop
2818 (loop, &LOOP_VINFO_DATAREFS (loop_vinfo)))
2820 if (dump_enabled_p ())
2821 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2822 "not vectorized: loop contains function calls"
2823 " or data references that cannot be analyzed");
2824 return false;
2827 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2829 else
2831 gimple_stmt_iterator gsi;
2833 bb = BB_VINFO_BB (bb_vinfo);
2834 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2836 gimple stmt = gsi_stmt (gsi);
2837 if (!find_data_references_in_stmt (NULL, stmt,
2838 &BB_VINFO_DATAREFS (bb_vinfo)))
2840 /* Mark the rest of the basic-block as unvectorizable. */
2841 for (; !gsi_end_p (gsi); gsi_next (&gsi))
2843 stmt = gsi_stmt (gsi);
2844 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
2846 break;
2850 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2853 /* Go through the data-refs, check that the analysis succeeded. Update
2854 pointer from stmt_vec_info struct to DR and vectype. */
2856 FOR_EACH_VEC_ELT (datarefs, i, dr)
2858 gimple stmt;
2859 stmt_vec_info stmt_info;
2860 tree base, offset, init;
2861 bool gather = false;
2862 int vf;
2864 if (!dr || !DR_REF (dr))
2866 if (dump_enabled_p ())
2867 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2868 "not vectorized: unhandled data-ref ");
2869 return false;
2872 stmt = DR_STMT (dr);
2873 stmt_info = vinfo_for_stmt (stmt);
2875 /* Check that analysis of the data-ref succeeded. */
2876 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
2877 || !DR_STEP (dr))
2879 /* If the target supports vector gather loads, see if they can
2880 be used. */
2881 if (loop_vinfo
2882 && DR_IS_READ (dr)
2883 && !TREE_THIS_VOLATILE (DR_REF (dr))
2884 && targetm.vectorize.builtin_gather != NULL
2885 && !nested_in_vect_loop_p (loop, stmt))
2887 struct data_reference *newdr
2888 = create_data_ref (NULL, loop_containing_stmt (stmt),
2889 DR_REF (dr), stmt, true);
2890 gcc_assert (newdr != NULL && DR_REF (newdr));
2891 if (DR_BASE_ADDRESS (newdr)
2892 && DR_OFFSET (newdr)
2893 && DR_INIT (newdr)
2894 && DR_STEP (newdr)
2895 && integer_zerop (DR_STEP (newdr)))
2897 dr = newdr;
2898 gather = true;
2900 else
2901 free_data_ref (newdr);
2904 if (!gather)
2906 if (dump_enabled_p ())
2908 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2909 "not vectorized: data ref analysis "
2910 "failed ");
2911 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2914 if (bb_vinfo)
2915 break;
2917 return false;
2921 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
2923 if (dump_enabled_p ())
2924 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2925 "not vectorized: base addr of dr is a "
2926 "constant");
2928 if (bb_vinfo)
2929 break;
2931 if (gather)
2932 free_data_ref (dr);
2933 return false;
2936 if (TREE_THIS_VOLATILE (DR_REF (dr)))
2938 if (dump_enabled_p ())
2940 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2941 "not vectorized: volatile type ");
2942 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2945 if (bb_vinfo)
2946 break;
2948 return false;
2951 if (stmt_can_throw_internal (stmt))
2953 if (dump_enabled_p ())
2955 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2956 "not vectorized: statement can throw an "
2957 "exception ");
2958 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2961 if (bb_vinfo)
2962 break;
2964 if (gather)
2965 free_data_ref (dr);
2966 return false;
2969 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
2970 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
2972 if (dump_enabled_p ())
2974 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2975 "not vectorized: statement is bitfield "
2976 "access ");
2977 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2980 if (bb_vinfo)
2981 break;
2983 if (gather)
2984 free_data_ref (dr);
2985 return false;
2988 base = unshare_expr (DR_BASE_ADDRESS (dr));
2989 offset = unshare_expr (DR_OFFSET (dr));
2990 init = unshare_expr (DR_INIT (dr));
2992 if (is_gimple_call (stmt))
2994 if (dump_enabled_p ())
2996 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2997 "not vectorized: dr in a call ");
2998 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3001 if (bb_vinfo)
3002 break;
3004 if (gather)
3005 free_data_ref (dr);
3006 return false;
3009 /* Update DR field in stmt_vec_info struct. */
3011 /* If the dataref is in an inner-loop of the loop that is considered
3012 for vectorization, we also want to analyze the access relative to
3013 the outer-loop (DR contains information only relative to the
3014 inner-most enclosing loop). We do that by building a reference to the
3015 first location accessed by the inner-loop, and analyze it relative to
3016 the outer-loop. */
3017 if (loop && nested_in_vect_loop_p (loop, stmt))
3019 tree outer_step, outer_base, outer_init;
3020 HOST_WIDE_INT pbitsize, pbitpos;
3021 tree poffset;
3022 enum machine_mode pmode;
3023 int punsignedp, pvolatilep;
3024 affine_iv base_iv, offset_iv;
3025 tree dinit;
3027 /* Build a reference to the first location accessed by the
3028 inner-loop: *(BASE+INIT). (The first location is actually
3029 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3030 tree inner_base = build_fold_indirect_ref
3031 (fold_build_pointer_plus (base, init));
3033 if (dump_enabled_p ())
3035 dump_printf_loc (MSG_NOTE, vect_location,
3036 "analyze in outer-loop: ");
3037 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3040 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3041 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3042 gcc_assert (outer_base != NULL_TREE);
3044 if (pbitpos % BITS_PER_UNIT != 0)
3046 if (dump_enabled_p ())
3047 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3048 "failed: bit offset alignment.\n");
3049 return false;
3052 outer_base = build_fold_addr_expr (outer_base);
3053 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3054 &base_iv, false))
3056 if (dump_enabled_p ())
3057 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3058 "failed: evolution of base is not affine.\n");
3059 return false;
3062 if (offset)
3064 if (poffset)
3065 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3066 poffset);
3067 else
3068 poffset = offset;
3071 if (!poffset)
3073 offset_iv.base = ssize_int (0);
3074 offset_iv.step = ssize_int (0);
3076 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3077 &offset_iv, false))
3079 if (dump_enabled_p ())
3080 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3081 "evolution of offset is not affine.\n");
3082 return false;
3085 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3086 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3087 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3088 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3089 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3091 outer_step = size_binop (PLUS_EXPR,
3092 fold_convert (ssizetype, base_iv.step),
3093 fold_convert (ssizetype, offset_iv.step));
3095 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3096 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3097 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3098 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3099 STMT_VINFO_DR_OFFSET (stmt_info) =
3100 fold_convert (ssizetype, offset_iv.base);
3101 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3102 size_int (highest_pow2_factor (offset_iv.base));
3104 if (dump_enabled_p ())
3106 dump_printf_loc (MSG_NOTE, vect_location,
3107 "\touter base_address: ");
3108 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3109 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3110 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3111 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3112 STMT_VINFO_DR_OFFSET (stmt_info));
3113 dump_printf (MSG_NOTE,
3114 "\n\touter constant offset from base address: ");
3115 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3116 STMT_VINFO_DR_INIT (stmt_info));
3117 dump_printf (MSG_NOTE, "\n\touter step: ");
3118 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3119 STMT_VINFO_DR_STEP (stmt_info));
3120 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3121 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3122 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3126 if (STMT_VINFO_DATA_REF (stmt_info))
3128 if (dump_enabled_p ())
3130 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3131 "not vectorized: more than one data ref "
3132 "in stmt: ");
3133 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3136 if (bb_vinfo)
3137 break;
3139 if (gather)
3140 free_data_ref (dr);
3141 return false;
3144 STMT_VINFO_DATA_REF (stmt_info) = dr;
3146 /* Set vectype for STMT. */
3147 scalar_type = TREE_TYPE (DR_REF (dr));
3148 STMT_VINFO_VECTYPE (stmt_info) =
3149 get_vectype_for_scalar_type (scalar_type);
3150 if (!STMT_VINFO_VECTYPE (stmt_info))
3152 if (dump_enabled_p ())
3154 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3155 "not vectorized: no vectype for stmt: ");
3156 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3157 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3158 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3159 scalar_type);
3162 if (bb_vinfo)
3163 break;
3165 if (gather)
3167 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3168 free_data_ref (dr);
3170 return false;
3172 else
3174 if (dump_enabled_p ())
3176 dump_printf_loc (MSG_NOTE, vect_location,
3177 "got vectype for stmt: ");
3178 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3179 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3180 STMT_VINFO_VECTYPE (stmt_info));
3184 /* Adjust the minimal vectorization factor according to the
3185 vector type. */
3186 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3187 if (vf > *min_vf)
3188 *min_vf = vf;
3190 if (gather)
3192 tree off;
3194 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3195 if (gather
3196 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3197 gather = false;
3198 if (!gather)
3200 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3201 free_data_ref (dr);
3202 if (dump_enabled_p ())
3204 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3205 "not vectorized: not suitable for gather "
3206 "load ");
3207 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3209 return false;
3212 datarefs[i] = dr;
3213 STMT_VINFO_GATHER_P (stmt_info) = true;
3215 else if (loop_vinfo
3216 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3218 if (nested_in_vect_loop_p (loop, stmt)
3219 || !DR_IS_READ (dr))
3221 if (dump_enabled_p ())
3223 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3224 "not vectorized: not suitable for strided "
3225 "load ");
3226 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3228 return false;
3230 STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3234 /* If we stopped the analysis at the first dataref that we could not
3235 analyze while trying to vectorize a basic block, mark the rest of the
3236 datarefs as not vectorizable and truncate the vector of datarefs. That
3237 avoids spending useless time in analyzing their dependence. */
3238 if (i != datarefs.length ())
3240 gcc_assert (bb_vinfo != NULL);
3241 for (unsigned j = i; j < datarefs.length (); ++j)
3243 data_reference_p dr = datarefs[j];
3244 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3245 free_data_ref (dr);
3247 datarefs.truncate (i);
3250 return true;
3254 /* Function vect_get_new_vect_var.
3256 Returns a name for a new variable. The current naming scheme prepends
3257 the prefix "vect_", "stmp_" or "vectp_" (depending on the value of
3258 VAR_KIND) to NAME if provided, and uses the prefix alone otherwise. */
3261 tree
3262 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3264 const char *prefix;
3265 tree new_vect_var;
3267 switch (var_kind)
3269 case vect_simple_var:
3270 prefix = "vect";
3271 break;
3272 case vect_scalar_var:
3273 prefix = "stmp";
3274 break;
3275 case vect_pointer_var:
3276 prefix = "vectp";
3277 break;
3278 default:
3279 gcc_unreachable ();
3282 if (name)
3284 char* tmp = concat (prefix, "_", name, NULL);
3285 new_vect_var = create_tmp_reg (type, tmp);
3286 free (tmp);
3288 else
3289 new_vect_var = create_tmp_reg (type, prefix);
3291 return new_vect_var;
3295 /* Function vect_create_addr_base_for_vector_ref.
3297 Create an expression that computes the address of the first memory location
3298 that will be accessed for a data reference.
3300 Input:
3301 STMT: The statement containing the data reference.
3302 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3303 OFFSET: Optional. If supplied, it is added to the initial address.
3304 LOOP: Specify relative to which loop-nest should the address be computed.
3305 For example, when the dataref is in an inner-loop nested in an
3306 outer-loop that is now being vectorized, LOOP can be either the
3307 outer-loop, or the inner-loop. The first memory location accessed
3308 by the following dataref ('in' points to short):
3310 for (i=0; i<N; i++)
3311 for (j=0; j<M; j++)
3312 s += in[i+j]
3314 is as follows:
3315 if LOOP=i_loop: &in (relative to i_loop)
3316 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3318 Output:
3319 1. Return an SSA_NAME whose value is the address of the memory location of
3320 the first vector of the data reference.
3321 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3322 these statement(s) which define the returned SSA_NAME.
3324 FORNOW: We are only handling array accesses with step 1. */
3326 tree
3327 vect_create_addr_base_for_vector_ref (gimple stmt,
3328 gimple_seq *new_stmt_list,
3329 tree offset,
3330 struct loop *loop)
3332 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3333 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3334 tree data_ref_base;
3335 const char *base_name;
3336 tree addr_base;
3337 tree dest;
3338 gimple_seq seq = NULL;
3339 tree base_offset;
3340 tree init;
3341 tree vect_ptr_type;
3342 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3343 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3345 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3347 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3349 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3351 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3352 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3353 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3355 else
3357 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3358 base_offset = unshare_expr (DR_OFFSET (dr));
3359 init = unshare_expr (DR_INIT (dr));
3362 if (loop_vinfo)
3363 base_name = get_name (data_ref_base);
3364 else
3366 base_offset = ssize_int (0);
3367 init = ssize_int (0);
3368 base_name = get_name (DR_REF (dr));
3371 /* Create base_offset */
3372 base_offset = size_binop (PLUS_EXPR,
3373 fold_convert (sizetype, base_offset),
3374 fold_convert (sizetype, init));
3376 if (offset)
3378 offset = fold_build2 (MULT_EXPR, sizetype,
3379 fold_convert (sizetype, offset), step);
3380 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3381 base_offset, offset);
3384 /* base + base_offset */
3385 if (loop_vinfo)
3386 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3387 else
3389 addr_base = build1 (ADDR_EXPR,
3390 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3391 unshare_expr (DR_REF (dr)));
3394 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3395 addr_base = fold_convert (vect_ptr_type, addr_base);
3396 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3397 addr_base = force_gimple_operand (addr_base, &seq, false, dest);
3398 gimple_seq_add_seq (new_stmt_list, seq);
3400 if (DR_PTR_INFO (dr)
3401 && TREE_CODE (addr_base) == SSA_NAME)
3403 duplicate_ssa_name_ptr_info (addr_base, DR_PTR_INFO (dr));
3404 if (offset)
3405 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3408 if (dump_enabled_p ())
3410 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3411 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3414 return addr_base;
3418 /* Function vect_create_data_ref_ptr.
3420 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3421 location accessed in the loop by STMT, along with the def-use update
3422 chain to appropriately advance the pointer through the loop iterations.
3423 Also set aliasing information for the pointer. This pointer is used by
3424 the callers to this function to create a memory reference expression for
3425 vector load/store access.
3427 Input:
3428 1. STMT: a stmt that references memory. Expected to be of the form
3429 GIMPLE_ASSIGN <name, data-ref> or
3430 GIMPLE_ASSIGN <data-ref, name>.
3431 2. AGGR_TYPE: the type of the reference, which should be either a vector
3432 or an array.
3433 3. AT_LOOP: the loop where the vector memref is to be created.
3434 4. OFFSET (optional): an offset to be added to the initial address accessed
3435 by the data-ref in STMT.
3436 5. BSI: location where the new stmts are to be placed if there is no loop
3437 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
3438 pointing to the initial address.
3440 Output:
3441 1. Declare a new ptr to vector_type, and have it point to the base of the
3442 data reference (initial address accessed by the data reference).
3443 For example, for vector of type V8HI, the following code is generated:
3445 v8hi *ap;
3446 ap = (v8hi *)initial_address;
3448 if OFFSET is not supplied:
3449 initial_address = &a[init];
3450 if OFFSET is supplied:
3451 initial_address = &a[init + OFFSET];
3453 Return the initial_address in INITIAL_ADDRESS.
3455 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
3456 update the pointer in each iteration of the loop.
3458 Return the increment stmt that updates the pointer in PTR_INCR.
3460 3. Set INV_P to true if the access pattern of the data reference in the
3461 vectorized loop is invariant. Set it to false otherwise.
3463 4. Return the pointer. */
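/* A simplified usage sketch (hypothetical caller code, condensed from the way
   the statement vectorization code typically drives this function; names such
   as APTR and INIT_ADDR are illustrative):

     tree aptr = vect_create_data_ref_ptr (stmt, vectype, loop, NULL_TREE,
                                           &init_addr, gsi, &ptr_incr,
                                           false, &inv_p);
     data_ref = build2 (MEM_REF, vectype, aptr,
                        build_int_cst (reference_alias_ptr_type (DR_REF (dr)),
                                       0));

   after which bump_vector_ptr can advance APTR between the vector accesses
   of an interleaving group.  */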
3465 tree
3466 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
3467 tree offset, tree *initial_address,
3468 gimple_stmt_iterator *gsi, gimple *ptr_incr,
3469 bool only_init, bool *inv_p)
3471 const char *base_name;
3472 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3473 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3474 struct loop *loop = NULL;
3475 bool nested_in_vect_loop = false;
3476 struct loop *containing_loop = NULL;
3477 tree aggr_ptr_type;
3478 tree aggr_ptr;
3479 tree new_temp;
3480 gimple vec_stmt;
3481 gimple_seq new_stmt_list = NULL;
3482 edge pe = NULL;
3483 basic_block new_bb;
3484 tree aggr_ptr_init;
3485 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3486 tree aptr;
3487 gimple_stmt_iterator incr_gsi;
3488 bool insert_after;
3489 tree indx_before_incr, indx_after_incr;
3490 gimple incr;
3491 tree step;
3492 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
3494 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
3495 || TREE_CODE (aggr_type) == VECTOR_TYPE);
3497 if (loop_vinfo)
3499 loop = LOOP_VINFO_LOOP (loop_vinfo);
3500 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
3501 containing_loop = (gimple_bb (stmt))->loop_father;
3502 pe = loop_preheader_edge (loop);
3504 else
3506 gcc_assert (bb_vinfo);
3507 only_init = true;
3508 *ptr_incr = NULL;
3511 /* Check the step (evolution) of the load in LOOP, and record
3512 whether it's invariant. */
3513 if (nested_in_vect_loop)
3514 step = STMT_VINFO_DR_STEP (stmt_info);
3515 else
3516 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
3518 if (integer_zerop (step))
3519 *inv_p = true;
3520 else
3521 *inv_p = false;
3523 /* Create an expression for the first address accessed by this load
3524 in LOOP. */
3525 base_name = get_name (DR_BASE_ADDRESS (dr));
3527 if (dump_enabled_p ())
3529 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
3530 dump_printf_loc (MSG_NOTE, vect_location,
3531 "create %s-pointer variable to type: ",
3532 tree_code_name[(int) TREE_CODE (aggr_type)]);
3533 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
3534 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
3535 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
3536 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
3537 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
3538 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
3539 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
3540 else
3541 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
3542 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
3545 /* (1) Create the new aggregate-pointer variable.
3546 Vector and array types inherit the alias set of their component
3547 type by default so we need to use a ref-all pointer if the data
3548 reference does not conflict with the created aggregated data
3549 reference because it is not addressable. */
3550 bool need_ref_all = false;
3551 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
3552 get_alias_set (DR_REF (dr))))
3553 need_ref_all = true;
3554 /* Likewise for any of the data references in the stmt group. */
3555 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
3557 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
3560 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
3561 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
3562 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
3563 get_alias_set (DR_REF (sdr))))
3565 need_ref_all = true;
3566 break;
3568 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
3570 while (orig_stmt);
3572 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
3573 need_ref_all);
3574 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
3577 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
3578 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
3579 def-use update cycles for the pointer: one relative to the outer-loop
3580 (LOOP), which is what steps (3) and (4) below do. The other is relative
3581 to the inner-loop (which is the inner-most loop containing the dataref),
3582 and this is done by step (5) below.
3584 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
3585 inner-most loop, and so steps (3),(4) work the same, and step (5) is
3586 redundant. Steps (3),(4) create the following:
3588 vp0 = &base_addr;
3589 LOOP: vp1 = phi(vp0,vp2)
3592 vp2 = vp1 + step
3593 goto LOOP
3595 If there is an inner-loop nested in loop, then step (5) will also be
3596 applied, and an additional update in the inner-loop will be created:
3598 vp0 = &base_addr;
3599 LOOP: vp1 = phi(vp0,vp2)
3601 inner: vp3 = phi(vp1,vp4)
3602 vp4 = vp3 + inner_step
3603 if () goto inner
3605 vp2 = vp1 + step
3606 if () goto LOOP */
3608 /* (2) Calculate the initial address of the aggregate-pointer, and set
3609 the aggregate-pointer to point to it before the loop. */
3611 /* Create: (&(base[init_val+offset]) in the loop preheader. */
3613 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
3614 offset, loop);
3615 if (new_stmt_list)
3617 if (pe)
3619 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
3620 gcc_assert (!new_bb);
3622 else
3623 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
3626 *initial_address = new_temp;
3628 /* Create: p = (aggr_type *) initial_base */
3629 if (TREE_CODE (new_temp) != SSA_NAME
3630 || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
3632 vec_stmt = gimple_build_assign (aggr_ptr,
3633 fold_convert (aggr_ptr_type, new_temp));
3634 aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
3635 /* Copy the points-to information if it exists. */
3636 if (DR_PTR_INFO (dr))
3637 duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
3638 gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
3639 if (pe)
3641 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
3642 gcc_assert (!new_bb);
3644 else
3645 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
3647 else
3648 aggr_ptr_init = new_temp;
3650 /* (3) Handle the updating of the aggregate-pointer inside the loop.
3651 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
3652 inner-loop nested in LOOP (during outer-loop vectorization). */
3654 /* No update in loop is required. */
3655 if (only_init && (!loop_vinfo || at_loop == loop))
3656 aptr = aggr_ptr_init;
3657 else
3659 /* The step of the aggregate pointer is the type size. */
3660 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
3661 /* One exception to the above is when the scalar step of the load in
3662 LOOP is zero. In this case the step here is also zero. */
3663 if (*inv_p)
3664 iv_step = size_zero_node;
3665 else if (tree_int_cst_sgn (step) == -1)
3666 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
3668 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
3670 create_iv (aggr_ptr_init,
3671 fold_convert (aggr_ptr_type, iv_step),
3672 aggr_ptr, loop, &incr_gsi, insert_after,
3673 &indx_before_incr, &indx_after_incr);
3674 incr = gsi_stmt (incr_gsi);
3675 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
3677 /* Copy the points-to information if it exists. */
3678 if (DR_PTR_INFO (dr))
3680 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
3681 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
3683 if (ptr_incr)
3684 *ptr_incr = incr;
3686 aptr = indx_before_incr;
3689 if (!nested_in_vect_loop || only_init)
3690 return aptr;
3693 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
3694 nested in LOOP, if it exists. */
3696 gcc_assert (nested_in_vect_loop);
3697 if (!only_init)
3699 standard_iv_increment_position (containing_loop, &incr_gsi,
3700 &insert_after);
3701 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
3702 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
3703 &indx_after_incr);
3704 incr = gsi_stmt (incr_gsi);
3705 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
3707 /* Copy the points-to information if it exists. */
3708 if (DR_PTR_INFO (dr))
3710 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
3711 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
3713 if (ptr_incr)
3714 *ptr_incr = incr;
3716 return indx_before_incr;
3718 else
3719 gcc_unreachable ();
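/* Illustrative sketch (hypothetical, not part of the vectorizer; guarded
   out of the build): a plain-C model of how the per-iteration pointer bump
   chosen in step (3) above behaves.  The names aggr_size, scalar_step and
   invariant are stand-ins for TYPE_SIZE_UNIT (aggr_type), the sign of
   DR_STEP (dr) and *inv_p.  */
#if 0
#include <stdio.h>

/* Return the byte offset added to the aggregate pointer on every
   iteration of the vectorized loop: zero for an invariant access,
   minus the aggregate size for a negative scalar step, and the
   aggregate size otherwise.  */
static long
iv_step_bytes (long aggr_size, long scalar_step, int invariant)
{
  if (invariant)
    return 0;
  if (scalar_step < 0)
    return -aggr_size;
  return aggr_size;
}

int
main (void)
{
  /* A V4SF-like aggregate: 4 floats = 16 bytes (assuming 4-byte float).  */
  printf ("forward:   %ld\n", iv_step_bytes (16, 4, 0));   /* 16 */
  printf ("backward:  %ld\n", iv_step_bytes (16, -4, 0));  /* -16 */
  printf ("invariant: %ld\n", iv_step_bytes (16, 0, 1));   /* 0 */
  return 0;
}
#endif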
3723 /* Function bump_vector_ptr
3725 Increment a pointer (to a vector type) by vector-size. If requested,
3726 i.e. if PTR_INCR is given, then also connect the new increment stmt
3727 to the existing def-use update-chain of the pointer, by modifying
3728 the PTR_INCR as illustrated below:
3730 The pointer def-use update-chain before this function:
3731 DATAREF_PTR = phi (p_0, p_2)
3732 ....
3733 PTR_INCR: p_2 = DATAREF_PTR + step
3735 The pointer def-use update-chain after this function:
3736 DATAREF_PTR = phi (p_0, p_2)
3737 ....
3738 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
3739 ....
3740 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
3742 Input:
3743 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
3744 in the loop.
3745 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
3746 the loop. The increment amount across iterations is expected
3747 to be vector_size.
3748 GSI - location where the new update stmt is to be placed.
3749 STMT - the original scalar memory-access stmt that is being vectorized.
3750 BUMP - optional. The offset by which to bump the pointer. If not given,
3751 the offset is assumed to be vector_size.
3753 Output: Return NEW_DATAREF_PTR as illustrated above.
3757 tree
3758 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
3759 gimple stmt, tree bump)
3761 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3762 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3763 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3764 tree update = TYPE_SIZE_UNIT (vectype);
3765 gimple incr_stmt;
3766 ssa_op_iter iter;
3767 use_operand_p use_p;
3768 tree new_dataref_ptr;
3770 if (bump)
3771 update = bump;
3773 new_dataref_ptr = copy_ssa_name (dataref_ptr, NULL);
3774 incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, new_dataref_ptr,
3775 dataref_ptr, update);
3776 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
3778 /* Copy the points-to information if it exists. */
3779 if (DR_PTR_INFO (dr))
3781 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
3782 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
3785 if (!ptr_incr)
3786 return new_dataref_ptr;
3788 /* Update the vector-pointer's cross-iteration increment. */
3789 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
3791 tree use = USE_FROM_PTR (use_p);
3793 if (use == dataref_ptr)
3794 SET_USE (use_p, new_dataref_ptr);
3795 else
3796 gcc_assert (tree_int_cst_compare (use, update) == 0);
3799 return new_dataref_ptr;
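/* Illustrative sketch (hypothetical, not part of the vectorizer; guarded
   out of the build): a toy model of the use-rewiring done above.  A
   "statement" is reduced to an array of operand slots; the operand that
   matches the old pointer name is redirected to the bumped name, and any
   other operand is expected to equal the bump amount, mirroring the
   FOR_EACH_SSA_USE_OPERAND loop.  All names here (p_1, p_2, p_3) are
   hypothetical.  */
#if 0
#include <assert.h>
#include <stdio.h>

/* Redirect every occurrence of OLD_PTR among the N operand slots in OPS
   to NEW_PTR; any other operand must be the bump value BUMP.  */
static void
rewire_uses (const char **ops, int n, const char *old_ptr,
             const char *new_ptr, const char *bump)
{
  for (int i = 0; i < n; i++)
    {
      if (ops[i] == old_ptr)
        ops[i] = new_ptr;
      else
        assert (ops[i] == bump);
    }
}

int
main (void)
{
  const char *dataref_ptr = "p_1";
  const char *new_dataref_ptr = "p_3";
  const char *bump = "16";
  /* PTR_INCR before:  p_2 = p_1 + 16.  */
  const char *incr_ops[2] = { dataref_ptr, bump };
  rewire_uses (incr_ops, 2, dataref_ptr, new_dataref_ptr, bump);
  /* PTR_INCR after:  p_2 = p_3 + 16.  */
  printf ("p_2 = %s + %s\n", incr_ops[0], incr_ops[1]);
  return 0;
}
#endif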
3803 /* Function vect_create_destination_var.
3805 Create a new temporary of type VECTYPE. */
3807 tree
3808 vect_create_destination_var (tree scalar_dest, tree vectype)
3810 tree vec_dest;
3811 const char *name;
3812 char *new_name;
3813 tree type;
3814 enum vect_var_kind kind;
3816 kind = vectype ? vect_simple_var : vect_scalar_var;
3817 type = vectype ? vectype : TREE_TYPE (scalar_dest);
3819 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
3821 name = get_name (scalar_dest);
3822 if (name)
3823 asprintf (&new_name, "%s_%u", name, SSA_NAME_VERSION (scalar_dest));
3824 else
3825 asprintf (&new_name, "_%u", SSA_NAME_VERSION (scalar_dest));
3826 vec_dest = vect_get_new_vect_var (type, kind, new_name);
3827 free (new_name);
3829 return vec_dest;
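/* Illustrative sketch (hypothetical, not part of the vectorizer; guarded
   out of the build): the naming scheme used above, in isolation.  A scalar
   SSA name such as "sum" with version 7 yields the temporary name "sum_7";
   an anonymous SSA name yields "_7".  */
#if 0
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>

static char *
dest_var_name (const char *name, unsigned version)
{
  char *new_name = NULL;
  int rc = name ? asprintf (&new_name, "%s_%u", name, version)
                : asprintf (&new_name, "_%u", version);
  return rc < 0 ? NULL : new_name;
}

int
main (void)
{
  char *a = dest_var_name ("sum", 7);
  char *b = dest_var_name (NULL, 7);
  printf ("%s %s\n", a ? a : "?", b ? b : "?");   /* prints: sum_7 _7 */
  free (a);
  free (b);
  return 0;
}
#endif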
3832 /* Function vect_grouped_store_supported.
3834 Returns TRUE if interleave high and interleave low permutations
3835 are supported, and FALSE otherwise. */
3837 bool
3838 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
3840 enum machine_mode mode = TYPE_MODE (vectype);
3842 /* vect_permute_store_chain requires the group size to be a power of two. */
3843 if (exact_log2 (count) == -1)
3845 if (dump_enabled_p ())
3846 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3847 "the size of the group of accesses"
3848 " is not a power of 2");
3849 return false;
3852 /* Check that the permutation is supported. */
3853 if (VECTOR_MODE_P (mode))
3855 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3856 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3857 for (i = 0; i < nelt / 2; i++)
3859 sel[i * 2] = i;
3860 sel[i * 2 + 1] = i + nelt;
3862 if (can_vec_perm_p (mode, false, sel))
3864 for (i = 0; i < nelt; i++)
3865 sel[i] += nelt / 2;
3866 if (can_vec_perm_p (mode, false, sel))
3867 return true;
3871 if (dump_enabled_p ())
3872 dump_printf (MSG_MISSED_OPTIMIZATION,
3873 "interleave op not supported by target.");
3874 return false;
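/* Illustrative sketch (hypothetical, not part of the vectorizer; guarded
   out of the build): the two interleave selectors whose availability is
   probed above, computed for a hypothetical 8-element vector mode.  */
#if 0
#include <stdio.h>

int
main (void)
{
  unsigned nelt = 8;
  unsigned char sel[8];
  unsigned i;

  /* Interleave "high": { 0, nelt, 1, nelt+1, ... }.  */
  for (i = 0; i < nelt / 2; i++)
    {
      sel[i * 2] = i;
      sel[i * 2 + 1] = i + nelt;
    }
  for (i = 0; i < nelt; i++)
    printf ("%u ", (unsigned) sel[i]);       /* 0 8 1 9 2 10 3 11 */
  printf ("\n");

  /* Interleave "low": every index shifted by nelt/2.  */
  for (i = 0; i < nelt; i++)
    printf ("%u ", sel[i] + nelt / 2);       /* 4 12 5 13 6 14 7 15 */
  printf ("\n");
  return 0;
}
#endif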
3878 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
3879 type VECTYPE. */
3881 bool
3882 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
3884 return vect_lanes_optab_supported_p ("vec_store_lanes",
3885 vec_store_lanes_optab,
3886 vectype, count);
3890 /* Function vect_permute_store_chain.
3892 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
3893 a power of 2, generate interleave_high/low stmts to reorder the data
3894 correctly for the stores. Return the final references for stores in
3895 RESULT_CHAIN.
3897 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
3898 The input is 4 vectors each containing 8 elements. We assign a number to
3899 each element; the input sequence is:
3901 1st vec: 0 1 2 3 4 5 6 7
3902 2nd vec: 8 9 10 11 12 13 14 15
3903 3rd vec: 16 17 18 19 20 21 22 23
3904 4th vec: 24 25 26 27 28 29 30 31
3906 The output sequence should be:
3908 1st vec: 0 8 16 24 1 9 17 25
3909 2nd vec: 2 10 18 26 3 11 19 27
3910 3rd vec: 4 12 20 28 5 13 21 29
3911 4th vec: 6 14 22 30 7 15 23 31
3913 i.e., we interleave the contents of the four vectors in their order.
3915 We use interleave_high/low instructions to create such output. The input of
3916 each interleave_high/low operation is two vectors:
3917 1st vec 2nd vec
3918 0 1 2 3 4 5 6 7
3919 the even elements of the result vector are obtained left-to-right from the
3920 high/low elements of the first vector. The odd elements of the result are
3921 obtained left-to-right from the high/low elements of the second vector.
3922 The output of interleave_high will be: 0 4 1 5
3923 and of interleave_low: 2 6 3 7
3926 The permutation is done in log LENGTH stages. In each stage interleave_high
3927 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
3928 where the first argument is taken from the first half of DR_CHAIN and the
3929 second argument from its second half.
3930 In our example,
3932 I1: interleave_high (1st vec, 3rd vec)
3933 I2: interleave_low (1st vec, 3rd vec)
3934 I3: interleave_high (2nd vec, 4th vec)
3935 I4: interleave_low (2nd vec, 4th vec)
3937 The output for the first stage is:
3939 I1: 0 16 1 17 2 18 3 19
3940 I2: 4 20 5 21 6 22 7 23
3941 I3: 8 24 9 25 10 26 11 27
3942 I4: 12 28 13 29 14 30 15 31
3944 The output of the second stage, i.e. the final result is:
3946 I1: 0 8 16 24 1 9 17 25
3947 I2: 2 10 18 26 3 11 19 27
3948 I3: 4 12 20 28 5 13 21 29
3949 I4: 6 14 22 30 7 15 23 31. */
3951 void
3952 vect_permute_store_chain (vec<tree> dr_chain,
3953 unsigned int length,
3954 gimple stmt,
3955 gimple_stmt_iterator *gsi,
3956 vec<tree> *result_chain)
3958 tree vect1, vect2, high, low;
3959 gimple perm_stmt;
3960 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
3961 tree perm_mask_low, perm_mask_high;
3962 unsigned int i, n;
3963 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
3964 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3966 result_chain->quick_grow (length);
3967 memcpy (result_chain->address (), dr_chain.address (),
3968 length * sizeof (tree));
3970 for (i = 0, n = nelt / 2; i < n; i++)
3972 sel[i * 2] = i;
3973 sel[i * 2 + 1] = i + nelt;
3975 perm_mask_high = vect_gen_perm_mask (vectype, sel);
3976 gcc_assert (perm_mask_high != NULL);
3978 for (i = 0; i < nelt; i++)
3979 sel[i] += nelt / 2;
3980 perm_mask_low = vect_gen_perm_mask (vectype, sel);
3981 gcc_assert (perm_mask_low != NULL);
3983 for (i = 0, n = exact_log2 (length); i < n; i++)
3985 for (j = 0; j < length/2; j++)
3987 vect1 = dr_chain[j];
3988 vect2 = dr_chain[j+length/2];
3990 /* Create interleaving stmt:
3991 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
3992 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
3993 perm_stmt
3994 = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
3995 vect1, vect2, perm_mask_high);
3996 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
3997 (*result_chain)[2*j] = high;
3999 /* Create interleaving stmt:
4000 low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
4001 nelt*3/2+1, ...}> */
4002 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4003 perm_stmt
4004 = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
4005 vect1, vect2, perm_mask_low);
4006 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4007 (*result_chain)[2*j+1] = low;
4009 memcpy (dr_chain.address (), result_chain->address (),
4010 length * sizeof (tree));
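/* Illustrative sketch (hypothetical, not part of the vectorizer; guarded
   out of the build): a scalar reference model of the interleaving above
   for LENGTH = 4 vectors of 8 elements, reproducing the output sequence
   documented in the function comment (0 8 16 24 1 9 17 25, ...).  */
#if 0
#include <stdio.h>
#include <string.h>

#define NELT 8
#define LEN  4

/* out[k] = concat (v1, v2)[sel[k]], i.e. a scalar VEC_PERM_EXPR.  */
static void
vec_perm (int *out, const int *v1, const int *v2, const unsigned char *sel)
{
  for (int k = 0; k < NELT; k++)
    out[k] = sel[k] < NELT ? v1[sel[k]] : v2[sel[k] - NELT];
}

int
main (void)
{
  int chain[LEN][NELT], result[LEN][NELT];
  unsigned char sel_high[NELT], sel_low[NELT];

  for (int j = 0; j < LEN; j++)
    for (int k = 0; k < NELT; k++)
      chain[j][k] = j * NELT + k;

  for (int k = 0; k < NELT / 2; k++)
    {
      sel_high[2 * k] = k;
      sel_high[2 * k + 1] = k + NELT;
      sel_low[2 * k] = k + NELT / 2;
      sel_low[2 * k + 1] = k + NELT / 2 + NELT;
    }

  /* log2 (LEN) = 2 stages of interleave high/low.  */
  for (int stage = 0; stage < 2; stage++)
    {
      for (int j = 0; j < LEN / 2; j++)
        {
          vec_perm (result[2 * j], chain[j], chain[j + LEN / 2], sel_high);
          vec_perm (result[2 * j + 1], chain[j], chain[j + LEN / 2], sel_low);
        }
      memcpy (chain, result, sizeof chain);
    }

  for (int j = 0; j < LEN; j++)
    {
      for (int k = 0; k < NELT; k++)
        printf ("%d ", chain[j][k]);
      printf ("\n");
    }
  return 0;
}
#endif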
4014 /* Function vect_setup_realignment
4016 This function is called when vectorizing an unaligned load using
4017 the dr_explicit_realign[_optimized] scheme.
4018 This function generates the following code at the loop prolog:
4020 p = initial_addr;
4021 x msq_init = *(floor(p)); # prolog load
4022 realignment_token = call target_builtin;
4023 loop:
4024 x msq = phi (msq_init, ---)
4026 The stmts marked with x are generated only for the case of
4027 dr_explicit_realign_optimized.
4029 The code above sets up a new (vector) pointer, pointing to the first
4030 location accessed by STMT, and a "floor-aligned" load using that pointer.
4031 It also generates code to compute the "realignment-token" (if the relevant
4032 target hook was defined), and creates a phi-node at the loop-header bb
4033 whose arguments are the result of the prolog-load (created by this
4034 function) and the result of a load that takes place in the loop (to be
4035 created by the caller to this function).
4037 For the case of dr_explicit_realign_optimized:
4038 The caller to this function uses the phi-result (msq) to create the
4039 realignment code inside the loop, and sets up the missing phi argument,
4040 as follows:
4041 loop:
4042 msq = phi (msq_init, lsq)
4043 lsq = *(floor(p')); # load in loop
4044 result = realign_load (msq, lsq, realignment_token);
4046 For the case of dr_explicit_realign:
4047 loop:
4048 msq = *(floor(p)); # load in loop
4049 p' = p + (VS-1);
4050 lsq = *(floor(p')); # load in loop
4051 result = realign_load (msq, lsq, realignment_token);
4053 Input:
4054 STMT - (scalar) load stmt to be vectorized. This load accesses
4055 a memory location that may be unaligned.
4056 GSI - place where new code is to be inserted.
4057 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4058 is used.
4060 Output:
4061 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4062 target hook, if defined.
4063 Return value - the result of the loop-header phi node. */
4065 tree
4066 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4067 tree *realignment_token,
4068 enum dr_alignment_support alignment_support_scheme,
4069 tree init_addr,
4070 struct loop **at_loop)
4072 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4073 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4074 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4075 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4076 struct loop *loop = NULL;
4077 edge pe = NULL;
4078 tree scalar_dest = gimple_assign_lhs (stmt);
4079 tree vec_dest;
4080 gimple inc;
4081 tree ptr;
4082 tree data_ref;
4083 gimple new_stmt;
4084 basic_block new_bb;
4085 tree msq_init = NULL_TREE;
4086 tree new_temp;
4087 gimple phi_stmt;
4088 tree msq = NULL_TREE;
4089 gimple_seq stmts = NULL;
4090 bool inv_p;
4091 bool compute_in_loop = false;
4092 bool nested_in_vect_loop = false;
4093 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4094 struct loop *loop_for_initial_load = NULL;
4096 if (loop_vinfo)
4098 loop = LOOP_VINFO_LOOP (loop_vinfo);
4099 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4102 gcc_assert (alignment_support_scheme == dr_explicit_realign
4103 || alignment_support_scheme == dr_explicit_realign_optimized);
4105 /* We need to generate three things:
4106 1. the misalignment computation
4107 2. the extra vector load (for the optimized realignment scheme).
4108 3. the phi node for the two vectors from which the realignment is
4109 done (for the optimized realignment scheme). */
4111 /* 1. Determine where to generate the misalignment computation.
4113 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4114 calculation will be generated by this function, outside the loop (in the
4115 preheader). Otherwise, INIT_ADDR has already been computed for us by the
4116 caller, inside the loop.
4118 Background: If the misalignment remains fixed throughout the iterations of
4119 the loop, then both realignment schemes are applicable, and also the
4120 misalignment computation can be done outside LOOP. This is because we are
4121 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4122 are a multiple of VS (the Vector Size), and therefore the misalignment in
4123 different vectorized LOOP iterations is always the same.
4124 The problem arises only if the memory access is in an inner-loop nested
4125 inside LOOP, which is now being vectorized using outer-loop vectorization.
4126 This is the only case when the misalignment of the memory access may not
4127 remain fixed throughout the iterations of the inner-loop (as explained in
4128 detail in vect_supportable_dr_alignment). In this case, not only is the
4129 optimized realignment scheme not applicable, but also the misalignment
4130 computation (and generation of the realignment token that is passed to
4131 REALIGN_LOAD) has to be done inside the loop.
4133 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4134 or not, which in turn determines if the misalignment is computed inside
4135 the inner-loop, or outside LOOP. */
4137 if (init_addr != NULL_TREE || !loop_vinfo)
4139 compute_in_loop = true;
4140 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4144 /* 2. Determine where to generate the extra vector load.
4146 For the optimized realignment scheme, instead of generating two vector
4147 loads in each iteration, we generate a single extra vector load in the
4148 preheader of the loop, and in each iteration reuse the result of the
4149 vector load from the previous iteration. In case the memory access is in
4150 an inner-loop nested inside LOOP, which is now being vectorized using
4151 outer-loop vectorization, we need to determine whether this initial vector
4152 load should be generated at the preheader of the inner-loop, or can be
4153 generated at the preheader of LOOP. If the memory access has no evolution
4154 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4155 to be generated inside LOOP (in the preheader of the inner-loop). */
4157 if (nested_in_vect_loop)
4159 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4160 bool invariant_in_outerloop =
4161 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4162 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4164 else
4165 loop_for_initial_load = loop;
4166 if (at_loop)
4167 *at_loop = loop_for_initial_load;
4169 if (loop_for_initial_load)
4170 pe = loop_preheader_edge (loop_for_initial_load);
4172 /* 3. For the case of the optimized realignment, create the first vector
4173 load at the loop preheader. */
4175 if (alignment_support_scheme == dr_explicit_realign_optimized)
4177 /* Create msq_init = *(floor(p1)) in the loop preheader */
4179 gcc_assert (!compute_in_loop);
4180 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4181 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4182 NULL_TREE, &init_addr, NULL, &inc,
4183 true, &inv_p);
4184 new_temp = copy_ssa_name (ptr, NULL);
4185 new_stmt = gimple_build_assign_with_ops
4186 (BIT_AND_EXPR, new_temp, ptr,
4187 build_int_cst (TREE_TYPE (ptr),
4188 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4189 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4190 gcc_assert (!new_bb);
4191 data_ref
4192 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4193 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4194 new_stmt = gimple_build_assign (vec_dest, data_ref);
4195 new_temp = make_ssa_name (vec_dest, new_stmt);
4196 gimple_assign_set_lhs (new_stmt, new_temp);
4197 if (pe)
4199 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4200 gcc_assert (!new_bb);
4202 else
4203 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4205 msq_init = gimple_assign_lhs (new_stmt);
4208 /* 4. Create realignment token using a target builtin, if available.
4209 It is done either inside the containing loop, or before LOOP (as
4210 determined above). */
4212 if (targetm.vectorize.builtin_mask_for_load)
4214 tree builtin_decl;
4216 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4217 if (!init_addr)
4219 /* Generate the INIT_ADDR computation outside LOOP. */
4220 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4221 NULL_TREE, loop);
4222 if (loop)
4224 pe = loop_preheader_edge (loop);
4225 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4226 gcc_assert (!new_bb);
4228 else
4229 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4232 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4233 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4234 vec_dest =
4235 vect_create_destination_var (scalar_dest,
4236 gimple_call_return_type (new_stmt));
4237 new_temp = make_ssa_name (vec_dest, new_stmt);
4238 gimple_call_set_lhs (new_stmt, new_temp);
4240 if (compute_in_loop)
4241 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4242 else
4244 /* Generate the misalignment computation outside LOOP. */
4245 pe = loop_preheader_edge (loop);
4246 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4247 gcc_assert (!new_bb);
4250 *realignment_token = gimple_call_lhs (new_stmt);
4252 /* The result of the CALL_EXPR to this builtin is determined from
4253 the value of the parameter and no global variables are touched
4254 which makes the builtin a "const" function. Requiring the
4255 builtin to have the "const" attribute makes it unnecessary
4256 to call mark_call_clobbered. */
4257 gcc_assert (TREE_READONLY (builtin_decl));
4260 if (alignment_support_scheme == dr_explicit_realign)
4261 return msq;
4263 gcc_assert (!compute_in_loop);
4264 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4267 /* 5. Create msq = phi <msq_init, lsq> in loop */
4269 pe = loop_preheader_edge (containing_loop);
4270 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4271 msq = make_ssa_name (vec_dest, NULL);
4272 phi_stmt = create_phi_node (msq, containing_loop->header);
4273 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4275 return msq;
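/* Illustrative sketch (hypothetical, not part of the vectorizer; guarded
   out of the build): a scalar model of the explicit-realignment scheme
   set up above, with a "vector" of VS = 4 ints.  msq is the aligned load
   at floor (p), lsq the aligned load at floor (p + VS - 1), and the
   realignment token is modelled simply as the element misalignment of p
   (in reality it is a target-specific permutation mask); REALIGN_LOAD is
   modelled as extracting VS elements from the concatenation msq|lsq.  */
#if 0
#include <stdio.h>

#define VS 4

static void
realign_load (int *out, const int *msq, const int *lsq, int misalign)
{
  for (int k = 0; k < VS; k++)
    out[k] = (misalign + k) < VS ? msq[misalign + k]
                                 : lsq[misalign + k - VS];
}

int
main (void)
{
  int a[12];
  for (int i = 0; i < 12; i++)
    a[i] = i;

  int misalign = 3;            /* p = &a[3], not vector aligned.  */
  const int *msq = &a[0];      /* *(floor (p))          */
  const int *lsq = &a[VS];     /* *(floor (p + VS - 1)) */
  int va[VS];

  realign_load (va, msq, lsq, misalign);
  for (int k = 0; k < VS; k++)
    printf ("%d ", va[k]);     /* prints: 3 4 5 6 */
  printf ("\n");
  return 0;
}
#endif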
4279 /* Function vect_grouped_load_supported.
4281 Returns TRUE if even and odd permutations are supported,
4282 and FALSE otherwise. */
4284 bool
4285 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4287 enum machine_mode mode = TYPE_MODE (vectype);
4289 /* vect_permute_load_chain requires the group size to be a power of two. */
4290 if (exact_log2 (count) == -1)
4292 if (dump_enabled_p ())
4293 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4294 "the size of the group of accesses"
4295 " is not a power of 2");
4296 return false;
4299 /* Check that the permutation is supported. */
4300 if (VECTOR_MODE_P (mode))
4302 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4303 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4305 for (i = 0; i < nelt; i++)
4306 sel[i] = i * 2;
4307 if (can_vec_perm_p (mode, false, sel))
4309 for (i = 0; i < nelt; i++)
4310 sel[i] = i * 2 + 1;
4311 if (can_vec_perm_p (mode, false, sel))
4312 return true;
4316 if (dump_enabled_p ())
4317 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4318 "extract even/odd not supported by target");
4319 return false;
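/* Illustrative sketch (hypothetical, not part of the vectorizer; guarded
   out of the build): the extract-even and extract-odd selectors probed
   above, for a hypothetical 8-element vector mode.  */
#if 0
#include <stdio.h>

int
main (void)
{
  unsigned nelt = 8;
  for (unsigned i = 0; i < nelt; i++)
    printf ("%u ", i * 2);       /* even: 0 2 4 6 8 10 12 14 */
  printf ("\n");
  for (unsigned i = 0; i < nelt; i++)
    printf ("%u ", i * 2 + 1);   /* odd:  1 3 5 7 9 11 13 15 */
  printf ("\n");
  return 0;
}
#endif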
4322 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
4323 type VECTYPE. */
4325 bool
4326 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4328 return vect_lanes_optab_supported_p ("vec_load_lanes",
4329 vec_load_lanes_optab,
4330 vectype, count);
4333 /* Function vect_permute_load_chain.
4335 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
4336 a power of 2, generate extract_even/odd stmts to reorder the input data
4337 correctly. Return the final references for loads in RESULT_CHAIN.
4339 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4340 The input is 4 vectors each containing 8 elements. We assign a number to each
4341 element; the input sequence is:
4343 1st vec: 0 1 2 3 4 5 6 7
4344 2nd vec: 8 9 10 11 12 13 14 15
4345 3rd vec: 16 17 18 19 20 21 22 23
4346 4th vec: 24 25 26 27 28 29 30 31
4348 The output sequence should be:
4350 1st vec: 0 4 8 12 16 20 24 28
4351 2nd vec: 1 5 9 13 17 21 25 29
4352 3rd vec: 2 6 10 14 18 22 26 30
4353 4th vec: 3 7 11 15 19 23 27 31
4355 i.e., the first output vector should contain the first elements of each
4356 interleaving group, etc.
4358 We use extract_even/odd instructions to create such output. The input of
4359 each extract_even/odd operation is two vectors
4360 1st vec 2nd vec
4361 0 1 2 3 4 5 6 7
4363 and the output is the vector of extracted even/odd elements. The output of
4364 extract_even will be: 0 2 4 6
4365 and of extract_odd: 1 3 5 7
4368 The permutation is done in log LENGTH stages. In each stage extract_even
4369 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
4370 their order. In our example,
4372 E1: extract_even (1st vec, 2nd vec)
4373 E2: extract_odd (1st vec, 2nd vec)
4374 E3: extract_even (3rd vec, 4th vec)
4375 E4: extract_odd (3rd vec, 4th vec)
4377 The output for the first stage will be:
4379 E1: 0 2 4 6 8 10 12 14
4380 E2: 1 3 5 7 9 11 13 15
4381 E3: 16 18 20 22 24 26 28 30
4382 E4: 17 19 21 23 25 27 29 31
4384 In order to proceed and create the correct sequence for the next stage (or
4385 for the correct output, if the second stage is the last one, as in our
4386 example), we first put the output of extract_even operation and then the
4387 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
4388 The input for the second stage is:
4390 1st vec (E1): 0 2 4 6 8 10 12 14
4391 2nd vec (E3): 16 18 20 22 24 26 28 30
4392 3rd vec (E2): 1 3 5 7 9 11 13 15
4393 4th vec (E4): 17 19 21 23 25 27 29 31
4395 The output of the second stage:
4397 E1: 0 4 8 12 16 20 24 28
4398 E2: 2 6 10 14 18 22 26 30
4399 E3: 1 5 9 13 17 21 25 29
4400 E4: 3 7 11 15 19 23 27 31
4402 And RESULT_CHAIN after reordering:
4404 1st vec (E1): 0 4 8 12 16 20 24 28
4405 2nd vec (E3): 1 5 9 13 17 21 25 29
4406 3rd vec (E2): 2 6 10 14 18 22 26 30
4407 4th vec (E4): 3 7 11 15 19 23 27 31. */
4409 static void
4410 vect_permute_load_chain (vec<tree> dr_chain,
4411 unsigned int length,
4412 gimple stmt,
4413 gimple_stmt_iterator *gsi,
4414 vec<tree> *result_chain)
4416 tree data_ref, first_vect, second_vect;
4417 tree perm_mask_even, perm_mask_odd;
4418 gimple perm_stmt;
4419 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4420 unsigned int i, j, log_length = exact_log2 (length);
4421 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
4422 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4424 result_chain->quick_grow (length);
4425 memcpy (result_chain->address (), dr_chain.address (),
4426 length * sizeof (tree));
4428 for (i = 0; i < nelt; ++i)
4429 sel[i] = i * 2;
4430 perm_mask_even = vect_gen_perm_mask (vectype, sel);
4431 gcc_assert (perm_mask_even != NULL);
4433 for (i = 0; i < nelt; ++i)
4434 sel[i] = i * 2 + 1;
4435 perm_mask_odd = vect_gen_perm_mask (vectype, sel);
4436 gcc_assert (perm_mask_odd != NULL);
4438 for (i = 0; i < log_length; i++)
4440 for (j = 0; j < length; j += 2)
4442 first_vect = dr_chain[j];
4443 second_vect = dr_chain[j+1];
4445 /* data_ref = permute_even (first_data_ref, second_data_ref); */
4446 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
4447 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4448 first_vect, second_vect,
4449 perm_mask_even);
4450 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4451 (*result_chain)[j/2] = data_ref;
4453 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
4454 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
4455 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4456 first_vect, second_vect,
4457 perm_mask_odd);
4458 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4459 (*result_chain)[j/2+length/2] = data_ref;
4461 memcpy (dr_chain.address (), result_chain->address (),
4462 length * sizeof (tree));
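/* Illustrative sketch (hypothetical, not part of the vectorizer; guarded
   out of the build): a scalar reference model of the extract-even/odd
   chain above for LENGTH = 4 vectors of 8 elements, reproducing the
   output documented in the function comment (0 4 8 12 16 20 24 28, ...).  */
#if 0
#include <stdio.h>
#include <string.h>

#define NELT 8
#define LEN  4

/* out[k] = concat (v1, v2)[sel[k]], i.e. a scalar VEC_PERM_EXPR.  */
static void
vec_perm (int *out, const int *v1, const int *v2, const unsigned char *sel)
{
  for (int k = 0; k < NELT; k++)
    out[k] = sel[k] < NELT ? v1[sel[k]] : v2[sel[k] - NELT];
}

int
main (void)
{
  int chain[LEN][NELT], result[LEN][NELT];
  unsigned char sel_even[NELT], sel_odd[NELT];

  for (int j = 0; j < LEN; j++)
    for (int k = 0; k < NELT; k++)
      chain[j][k] = j * NELT + k;

  for (int k = 0; k < NELT; k++)
    {
      sel_even[k] = 2 * k;
      sel_odd[k] = 2 * k + 1;
    }

  /* log2 (LEN) = 2 stages of extract even/odd.  */
  for (int stage = 0; stage < 2; stage++)
    {
      for (int j = 0; j < LEN; j += 2)
        {
          vec_perm (result[j / 2], chain[j], chain[j + 1], sel_even);
          vec_perm (result[j / 2 + LEN / 2], chain[j], chain[j + 1], sel_odd);
        }
      memcpy (chain, result, sizeof chain);
    }

  for (int j = 0; j < LEN; j++)
    {
      for (int k = 0; k < NELT; k++)
        printf ("%d ", chain[j][k]);
      printf ("\n");
    }
  return 0;
}
#endif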
4467 /* Function vect_transform_grouped_load.
4469 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
4470 to perform their permutation and attach the resulting vectorized statements to
4471 the scalar statements.
4474 void
4475 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
4476 gimple_stmt_iterator *gsi)
4478 vec<tree> result_chain = vNULL;
4480 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
4481 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
4482 vectors, that are ready for vector computation. */
4483 result_chain.create (size);
4484 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
4485 vect_record_grouped_load_vectors (stmt, result_chain);
4486 result_chain.release ();
4489 /* RESULT_CHAIN contains the output of a group of grouped loads that were
4490 generated as part of the vectorization of STMT. Assign the statement
4491 for each vector to the associated scalar statement. */
4493 void
4494 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
4496 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
4497 gimple next_stmt, new_stmt;
4498 unsigned int i, gap_count;
4499 tree tmp_data_ref;
4501 /* Put a permuted data-ref in the VECTORIZED_STMT field.
4502 Since we scan the chain starting from its first node, their order
4503 corresponds to the order of data-refs in RESULT_CHAIN. */
4504 next_stmt = first_stmt;
4505 gap_count = 1;
4506 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
4508 if (!next_stmt)
4509 break;
4511 /* Skip the gaps. Loads created for the gaps will be removed by the dead
4512 code elimination pass later. No need to check for the first stmt in
4513 the group, since it always exists.
4514 GROUP_GAP is the number of steps in elements from the previous
4515 access (if there is no gap GROUP_GAP is 1). We skip loads that
4516 correspond to the gaps. */
4517 if (next_stmt != first_stmt
4518 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
4520 gap_count++;
4521 continue;
4524 while (next_stmt)
4526 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
4527 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
4528 copies, and we put the new vector statement in the first available
4529 RELATED_STMT. */
4530 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
4531 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
4532 else
4534 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4536 gimple prev_stmt =
4537 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
4538 gimple rel_stmt =
4539 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
4540 while (rel_stmt)
4542 prev_stmt = rel_stmt;
4543 rel_stmt =
4544 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
4547 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
4548 new_stmt;
4552 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
4553 gap_count = 1;
4554 /* If NEXT_STMT accesses the same DR as the previous statement,
4555 put the same TMP_DATA_REF as its vectorized statement; otherwise
4556 get the next data-ref from RESULT_CHAIN. */
4557 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4558 break;
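/* Illustrative sketch (hypothetical, not part of the vectorizer; guarded
   out of the build): a toy model of the gap handling above.  Group members
   are reduced to their GROUP_GAP value (distance, in elements, from the
   previous member); permuted vectors are numbered 0, 1, 2, ... and one
   vector is consumed per element position, so positions that fall in a
   gap are skipped.  The concrete group layout is made up.  */
#if 0
#include <stdio.h>

int
main (void)
{
  /* A group of 3 members covering 4 element positions: the second member
     is 2 elements after the first (a one-element gap).  */
  int group_gap[3] = { 1, 2, 1 };
  int n_members = 3, n_vectors = 4;

  int member = 0, gap_count = 1;
  for (int vec = 0; vec < n_vectors && member < n_members; vec++)
    {
      if (member != 0 && gap_count < group_gap[member])
        {
          /* This permuted vector corresponds to a gap: skip it.  */
          gap_count++;
          continue;
        }
      printf ("member %d gets vector %d\n", member, vec);
      member++;
      gap_count = 1;
    }
  return 0;
}
#endif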
4563 /* Function vect_can_force_dr_alignment_p.
4565 Returns whether the alignment of a DECL can be forced to be aligned
4566 on an ALIGNMENT-bit boundary. */
4568 bool
4569 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
4571 if (TREE_CODE (decl) != VAR_DECL)
4572 return false;
4574 /* We cannot change alignment of common or external symbols as another
4575 translation unit may contain a definition with lower alignment.
4576 The rules of common symbol linking mean that the definition
4577 will override the common symbol. The same is true for constant
4578 pool entries which may be shared and are not properly merged
4579 by LTO. */
4580 if (DECL_EXTERNAL (decl)
4581 || DECL_COMMON (decl)
4582 || DECL_IN_CONSTANT_POOL (decl))
4583 return false;
4585 if (TREE_ASM_WRITTEN (decl))
4586 return false;
4588 /* Do not override the alignment as specified by the ABI when the used
4589 attribute is set. */
4590 if (DECL_PRESERVE_P (decl))
4591 return false;
4593 /* Do not override explicit alignment set by the user when an explicit
4594 section name is also used. This is a common idiom used by many
4595 software projects. */
4596 if (DECL_SECTION_NAME (decl) != NULL_TREE
4597 && !DECL_HAS_IMPLICIT_SECTION_NAME_P (decl))
4598 return false;
4600 if (TREE_STATIC (decl))
4601 return (alignment <= MAX_OFILE_ALIGNMENT);
4602 else
4603 return (alignment <= MAX_STACK_ALIGNMENT);
4607 /* Return whether the data reference DR is supported with respect to its
4608 alignment.
4609 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even if
4610 it is aligned, i.e., check if it is possible to vectorize it with different
4611 alignment. */
4613 enum dr_alignment_support
4614 vect_supportable_dr_alignment (struct data_reference *dr,
4615 bool check_aligned_accesses)
4617 gimple stmt = DR_STMT (dr);
4618 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4619 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4620 enum machine_mode mode = TYPE_MODE (vectype);
4621 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4622 struct loop *vect_loop = NULL;
4623 bool nested_in_vect_loop = false;
4625 if (aligned_access_p (dr) && !check_aligned_accesses)
4626 return dr_aligned;
4628 if (loop_vinfo)
4630 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
4631 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
4634 /* Possibly unaligned access. */
4636 /* We can choose between using the implicit realignment scheme (generating
4637 a misaligned_move stmt) and the explicit realignment scheme (generating
4638 aligned loads with a REALIGN_LOAD). There are two variants to the
4639 explicit realignment scheme: optimized, and unoptimized.
4640 We can optimize the realignment only if the step between consecutive
4641 vector loads is equal to the vector size. Since the vector memory
4642 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
4643 is guaranteed that the misalignment amount remains the same throughout the
4644 execution of the vectorized loop. Therefore, we can create the
4645 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
4646 at the loop preheader.
4648 However, in the case of outer-loop vectorization, when vectorizing a
4649 memory access in the inner-loop nested within the LOOP that is now being
4650 vectorized, while it is guaranteed that the misalignment of the
4651 vectorized memory access will remain the same in different outer-loop
4652 iterations, it is *not* guaranteed that it will remain the same throughout
4653 the execution of the inner-loop. This is because the inner-loop advances
4654 with the original scalar step (and not in steps of VS). If the inner-loop
4655 step happens to be a multiple of VS, then the misalignment remains fixed
4656 and we can use the optimized realignment scheme. For example:
4658 for (i=0; i<N; i++)
4659 for (j=0; j<M; j++)
4660 s += a[i+j];
4662 When vectorizing the i-loop in the above example, the step between
4663 consecutive vector loads is 1, and so the misalignment does not remain
4664 fixed across the execution of the inner-loop, and the realignment cannot
4665 be optimized (as illustrated in the following pseudo vectorized loop):
4667 for (i=0; i<N; i+=4)
4668 for (j=0; j<M; j++){
4669 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
4670 // when j is {0,1,2,3,4,5,6,7,...} respectively.
4671 // (assuming that we start from an aligned address).
4674 We therefore have to use the unoptimized realignment scheme:
4676 for (i=0; i<N; i+=4)
4677 for (j=k; j<M; j+=4)
4678 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
4679 // that the misalignment of the initial address is
4680 // 0).
4682 The loop can then be vectorized as follows:
4684 for (k=0; k<4; k++){
4685 rt = get_realignment_token (&vp[k]);
4686 for (i=0; i<N; i+=4){
4687 v1 = vp[i+k];
4688 for (j=k; j<M; j+=4){
4689 v2 = vp[i+j+VS-1];
4690 va = REALIGN_LOAD <v1,v2,rt>;
4691 vs += va;
4692 v1 = v2;
4695 } */
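/* Illustrative sketch (hypothetical, not part of the vectorizer; guarded
   out of the build): the element misalignment of &vp[i+j] in the pseudo
   vectorized loops of the comment above, for VS = 4.  The inner loop of
   the first (unoptimized) form walks through all residues, while the
   strip-mined form keeps the misalignment fixed at k.  */
#if 0
#include <stdio.h>

int
main (void)
{
  int vs = 4;

  /* Inner loop advancing by the scalar step: misalignment cycles.  */
  for (int j = 0; j < 8; j++)
    printf ("%d ", j % vs);          /* 0 1 2 3 0 1 2 3 */
  printf ("\n");

  /* Inner loop advancing by VS from offset k: misalignment is fixed.  */
  int k = 2;
  for (int j = k; j < 8 + k; j += vs)
    printf ("%d ", j % vs);          /* 2 2 */
  printf ("\n");
  return 0;
}
#endif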
4697 if (DR_IS_READ (dr))
4699 bool is_packed = false;
4700 tree type = (TREE_TYPE (DR_REF (dr)));
4702 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
4703 && (!targetm.vectorize.builtin_mask_for_load
4704 || targetm.vectorize.builtin_mask_for_load ()))
4706 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4707 if ((nested_in_vect_loop
4708 && (TREE_INT_CST_LOW (DR_STEP (dr))
4709 != GET_MODE_SIZE (TYPE_MODE (vectype))))
4710 || !loop_vinfo)
4711 return dr_explicit_realign;
4712 else
4713 return dr_explicit_realign_optimized;
4715 if (!known_alignment_for_access_p (dr))
4716 is_packed = not_size_aligned (DR_REF (dr));
4718 if ((TYPE_USER_ALIGN (type) && !is_packed)
4719 || targetm.vectorize.
4720 support_vector_misalignment (mode, type,
4721 DR_MISALIGNMENT (dr), is_packed))
4722 /* Can't software pipeline the loads, but can at least do them. */
4723 return dr_unaligned_supported;
4725 else
4727 bool is_packed = false;
4728 tree type = (TREE_TYPE (DR_REF (dr)));
4730 if (!known_alignment_for_access_p (dr))
4731 is_packed = not_size_aligned (DR_REF (dr));
4733 if ((TYPE_USER_ALIGN (type) && !is_packed)
4734 || targetm.vectorize.
4735 support_vector_misalignment (mode, type,
4736 DR_MISALIGNMENT (dr), is_packed))
4737 return dr_unaligned_supported;
4740 /* Unsupported. */
4741 return dr_unaligned_unsupported;