gcc/tree-vect-data-refs.c  (official-gcc.git, 2013-04-26, Richard Biener <rguenther@suse.de>)
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2013 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "tm_p.h"
30 #include "target.h"
31 #include "basic-block.h"
32 #include "gimple-pretty-print.h"
33 #include "tree-flow.h"
34 #include "dumpfile.h"
35 #include "cfgloop.h"
36 #include "tree-chrec.h"
37 #include "tree-scalar-evolution.h"
38 #include "tree-vectorizer.h"
39 #include "diagnostic-core.h"
41 /* Need to include rtl.h, expr.h, etc. for optabs. */
42 #include "expr.h"
43 #include "optabs.h"
45 /* Return true if load- or store-lanes optab OPTAB is implemented for
46 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
48 static bool
49 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
50 tree vectype, unsigned HOST_WIDE_INT count)
52 enum machine_mode mode, array_mode;
53 bool limit_p;
55 mode = TYPE_MODE (vectype);
56 limit_p = !targetm.array_mode_supported_p (mode, count);
57 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
58 MODE_INT, limit_p);
60 if (array_mode == BLKmode)
62 if (dump_enabled_p ())
63 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
64 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]",
65 GET_MODE_NAME (mode), count);
66 return false;
69 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
71 if (dump_enabled_p ())
72 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
73 "cannot use %s<%s><%s>", name,
74 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
75 return false;
78 if (dump_enabled_p ())
79 dump_printf_loc (MSG_NOTE, vect_location,
80 "can use %s<%s><%s>", name, GET_MODE_NAME (array_mode),
81 GET_MODE_NAME (mode));
83 return true;
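/* Illustration: for COUNT == 2 vectors of type V4SI (128 bits each) the
   array mode must be 256 bits wide.  If the target reports that array
   mode as supported, the MAX_FIXED_MODE_SIZE limit is waived and an
   integer mode of that width is used when the machine defines one;
   otherwise mode_for_size typically returns BLKmode and the optab is
   rejected with the "no array mode" message above.  */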
87 /* Return the smallest scalar part of STMT.
88 This is used to determine the vectype of the stmt. We generally set the
89 vectype according to the type of the result (lhs). For stmts whose
90 result-type is different than the type of the arguments (e.g., demotion,
91 promotion), vectype will be reset appropriately (later). Note that we have
92 to visit the smallest datatype in this function, because that determines the
93 VF. If the smallest datatype in the loop is present only as the rhs of a
94 promotion operation, we'd miss it.
95 Such a case, where a variable of this datatype does not appear in the lhs
96 anywhere in the loop, can only occur if it's an invariant: e.g.:
97 'int_x = (int) short_inv', which we'd expect to have been optimized away by
98 invariant motion. However, we cannot rely on invariant motion to always
99 take invariants out of the loop, and so in the case of promotion we also
100 have to check the rhs.
101 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
102 types. */
104 tree
105 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
106 HOST_WIDE_INT *rhs_size_unit)
108 tree scalar_type = gimple_expr_type (stmt);
109 HOST_WIDE_INT lhs, rhs;
111 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
113 if (is_gimple_assign (stmt)
114 && (gimple_assign_cast_p (stmt)
115 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
116 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
117 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
119 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
121 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
122 if (rhs < lhs)
123 scalar_type = rhs_type;
126 *lhs_size_unit = lhs;
127 *rhs_size_unit = rhs;
128 return scalar_type;
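/* Worked example: for a widening multiplication such as
     int_res = (int) short_a * (int) short_b
   represented as a WIDEN_MULT_EXPR, the lhs type is int (4 bytes) and
   rhs1 is of type short (2 bytes), so the function returns the short
   type with *LHS_SIZE_UNIT == 4 and *RHS_SIZE_UNIT == 2; the
   vectorization factor is then derived from the 2-byte element.  */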
132 /* Check if the data references pointed to by DR_I and DR_J are the same or
133 belong to the same interleaving group. Return FALSE if the drs are
134 different, otherwise return TRUE. */
136 static bool
137 vect_same_range_drs (data_reference_p dr_i, data_reference_p dr_j)
139 gimple stmt_i = DR_STMT (dr_i);
140 gimple stmt_j = DR_STMT (dr_j);
142 if (operand_equal_p (DR_REF (dr_i), DR_REF (dr_j), 0)
143 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
144 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j))
145 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
146 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j)))))
147 return true;
148 else
149 return false;
152 /* If address ranges represented by DDR_I and DDR_J are equal,
153 return TRUE, otherwise return FALSE. */
155 static bool
156 vect_vfa_range_equal (ddr_p ddr_i, ddr_p ddr_j)
158 if ((vect_same_range_drs (DDR_A (ddr_i), DDR_A (ddr_j))
159 && vect_same_range_drs (DDR_B (ddr_i), DDR_B (ddr_j)))
160 || (vect_same_range_drs (DDR_A (ddr_i), DDR_B (ddr_j))
161 && vect_same_range_drs (DDR_B (ddr_i), DDR_A (ddr_j))))
162 return true;
163 else
164 return false;
167 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
168 tested at run-time. Return TRUE if DDR was successfully inserted.
169 Return FALSE if versioning is not supported. */
171 static bool
172 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
174 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
176 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
177 return false;
179 if (dump_enabled_p ())
181 dump_printf_loc (MSG_NOTE, vect_location,
182 "mark for run-time aliasing test between ");
183 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
184 dump_printf (MSG_NOTE, " and ");
185 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
188 if (optimize_loop_nest_for_size_p (loop))
190 if (dump_enabled_p ())
191 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
192 "versioning not supported when optimizing for size.");
193 return false;
196 /* FORNOW: We don't support versioning with outer-loop vectorization. */
197 if (loop->inner)
199 if (dump_enabled_p ())
200 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
201 "versioning not yet supported for outer-loops.");
202 return false;
205 /* FORNOW: We don't support creating runtime alias tests for non-constant
206 step. */
207 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
208 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
210 if (dump_enabled_p ())
211 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
212 "versioning not yet supported for non-constant "
213 "step");
214 return false;
217 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
218 return true;
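/* Note: this function only records the DDR; the actual overlap test is
   generated later, when the loop is versioned for alias.  Setting
   --param vect-max-version-for-alias-checks=0 therefore disables
   run-time alias versioning altogether via the early return above.  */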
222 /* Function vect_analyze_data_ref_dependence.
224 Return TRUE if there (might) exist a dependence between a memory-reference
225 DRA and a memory-reference DRB. When versioning for alias may check a
226 dependence at run-time, return FALSE. Adjust *MAX_VF according to
227 the data dependence. */
229 static bool
230 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
231 loop_vec_info loop_vinfo, int *max_vf)
233 unsigned int i;
234 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
235 struct data_reference *dra = DDR_A (ddr);
236 struct data_reference *drb = DDR_B (ddr);
237 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
238 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
239 lambda_vector dist_v;
240 unsigned int loop_depth;
242 /* In loop analysis all data references should be vectorizable. */
243 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
244 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
245 gcc_unreachable ();
247 /* Independent data accesses. */
248 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
249 return false;
251 if (dra == drb
252 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
253 return false;
255 /* Unknown data dependence. */
256 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
258 if (STMT_VINFO_GATHER_P (stmtinfo_a)
259 || STMT_VINFO_GATHER_P (stmtinfo_b))
261 if (dump_enabled_p ())
263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
264 "versioning for alias not supported for: "
265 "can't determine dependence between ");
266 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
267 DR_REF (dra));
268 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
269 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
270 DR_REF (drb));
272 return false;
275 if (dump_enabled_p ())
277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
278 "versioning for alias required: "
279 "can't determine dependence between ");
280 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
281 DR_REF (dra));
282 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
283 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
284 DR_REF (drb));
287 /* Add to list of ddrs that need to be tested at run-time. */
288 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
291 /* Known data dependence. */
292 if (DDR_NUM_DIST_VECTS (ddr) == 0)
294 if (STMT_VINFO_GATHER_P (stmtinfo_a)
295 || STMT_VINFO_GATHER_P (stmtinfo_b))
297 if (dump_enabled_p ())
299 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
300 "versioning for alias not supported for: "
301 "bad dist vector for ");
302 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
303 DR_REF (dra));
304 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
305 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
306 DR_REF (drb));
308 return false;
311 if (dump_enabled_p ())
313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
314 "versioning for alias required: "
315 "bad dist vector for ");
316 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
317 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
318 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
320 /* Add to list of ddrs that need to be tested at run-time. */
321 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
324 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
325 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
327 int dist = dist_v[loop_depth];
329 if (dump_enabled_p ())
330 dump_printf_loc (MSG_NOTE, vect_location,
331 "dependence distance = %d.", dist);
333 if (dist == 0)
335 if (dump_enabled_p ())
337 dump_printf_loc (MSG_NOTE, vect_location,
338 "dependence distance == 0 between ");
339 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
340 dump_printf (MSG_NOTE, " and ");
341 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
344 /* When we perform grouped accesses and perform implicit CSE
345 by detecting equal accesses and doing disambiguation with
346 runtime alias tests like for
347 .. = a[i];
348 .. = a[i+1];
349 a[i] = ..;
350 a[i+1] = ..;
351 *p = ..;
352 .. = a[i];
353 .. = a[i+1];
354 where we will end up loading { a[i], a[i+1] } once, make
355 sure that inserting group loads before the first load and
356 stores after the last store will do the right thing. */
357 if ((STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
358 && GROUP_SAME_DR_STMT (stmtinfo_a))
359 || (STMT_VINFO_GROUPED_ACCESS (stmtinfo_b)
360 && GROUP_SAME_DR_STMT (stmtinfo_b)))
362 gimple earlier_stmt;
363 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
364 if (DR_IS_WRITE
365 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
367 if (dump_enabled_p ())
368 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
369 "READ_WRITE dependence in interleaving.");
370 return true;
374 continue;
377 if (dist > 0 && DDR_REVERSED_P (ddr))
379 /* If DDR_REVERSED_P the order of the data-refs in DDR was
380 reversed (to make distance vector positive), and the actual
381 distance is negative. */
382 if (dump_enabled_p ())
383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
384 "dependence distance negative.");
385 continue;
388 if (abs (dist) >= 2
389 && abs (dist) < *max_vf)
391 /* The dependence distance requires reduction of the maximal
392 vectorization factor. */
393 *max_vf = abs (dist);
394 if (dump_enabled_p ())
395 dump_printf_loc (MSG_NOTE, vect_location,
396 "adjusting maximal vectorization factor to %i",
397 *max_vf);
400 if (abs (dist) >= *max_vf)
402 /* Dependence distance does not create dependence, as far as
403 vectorization is concerned, in this case. */
404 if (dump_enabled_p ())
405 dump_printf_loc (MSG_NOTE, vect_location,
406 "dependence distance >= VF.");
407 continue;
410 if (dump_enabled_p ())
412 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
413 "not vectorized, possible dependence "
414 "between data-refs ");
415 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
416 dump_printf (MSG_NOTE, " and ");
417 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
420 return true;
423 return false;
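/* Illustration of the distance handling above, assuming *MAX_VF starts at 8:
   a distance of 0 is harmless unless it hides a read/write pair in the same
   interleaving chain; a distance of 3 first lowers *MAX_VF to 3 and is then
   ignored because abs (dist) >= *MAX_VF; a distance of 1 falls through to
   the final case and the function reports a possible dependence.  */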
426 /* Function vect_analyze_data_ref_dependences.
428 Examine all the data references in the loop, and make sure there do not
429 exist any data dependences between them. Set *MAX_VF according to
430 the maximum vectorization factor the data dependences allow. */
432 bool
433 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
435 unsigned int i;
436 struct data_dependence_relation *ddr;
438 if (dump_enabled_p ())
439 dump_printf_loc (MSG_NOTE, vect_location,
440 "=== vect_analyze_data_ref_dependences ===");
442 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
443 &LOOP_VINFO_DDRS (loop_vinfo),
444 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
445 return false;
447 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
448 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
449 return false;
451 return true;
455 /* Function vect_slp_analyze_data_ref_dependence.
457 Return TRUE if there (might) exist a dependence between a memory-reference
458 DRA and a memory-reference DRB that would prohibit basic-block
459 vectorization. Return FALSE otherwise. */
462 static bool
463 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
465 struct data_reference *dra = DDR_A (ddr);
466 struct data_reference *drb = DDR_B (ddr);
468 /* We need to check dependences of statements marked as unvectorizable
469 as well; they can still prohibit vectorization. */
471 /* Independent data accesses. */
472 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
473 return false;
475 if (dra == drb)
476 return false;
478 /* Read-read is OK. */
479 if (DR_IS_READ (dra) && DR_IS_READ (drb))
480 return false;
482 /* If dra and drb are part of the same interleaving chain consider
483 them independent. */
484 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
485 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
486 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
487 return false;
489 /* Unknown data dependence. */
490 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
492 gimple earlier_stmt;
494 if (dump_enabled_p ())
496 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
497 "can't determine dependence between ");
498 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
499 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
500 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
503 /* We do not vectorize basic blocks with write-write dependencies. */
504 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
505 return true;
507 /* Check that it's not a load-after-store dependence. */
508 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
509 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
510 return true;
512 return false;
515 if (dump_enabled_p ())
517 dump_printf_loc (MSG_NOTE, vect_location,
518 "determined dependence between ");
519 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
520 dump_printf (MSG_NOTE, " and ");
521 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
524 /* Do not vectorize basic blocks with write-write dependences. */
525 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
526 return true;
528 /* Check dependence between DRA and DRB for basic block vectorization.
529 If the accesses share the same base and offset, we can compare their initial
530 constant offsets to decide whether they differ or not. In case of a read-
531 write dependence we check that the load is before the store to ensure that
532 vectorization will not change the order of the accesses. */
534 HOST_WIDE_INT type_size_a, type_size_b, init_a, init_b;
535 gimple earlier_stmt;
537 /* Check that the data-refs have same bases and offsets. If not, we can't
538 determine if they are dependent. */
539 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
540 || !dr_equal_offsets_p (dra, drb))
541 return true;
543 /* Check the types. */
544 type_size_a = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))));
545 type_size_b = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
547 if (type_size_a != type_size_b
548 || !types_compatible_p (TREE_TYPE (DR_REF (dra)),
549 TREE_TYPE (DR_REF (drb))))
550 return true;
552 init_a = TREE_INT_CST_LOW (DR_INIT (dra));
553 init_b = TREE_INT_CST_LOW (DR_INIT (drb));
555 /* Two different locations - no dependence. */
556 if (init_a != init_b)
557 return false;
559 /* We have a read-write dependence. Check that the load is before the store.
560 When we vectorize basic blocks, a vector load can only be placed before the
561 corresponding scalar load, and a vector store can only be placed after its
562 corresponding scalar store. So the order of the accesses is preserved in
563 case the load is before the store. */
564 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
565 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
566 return false;
568 return true;
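/* Illustration: for two accesses with the same base, offset and type,
   differing DR_INIT values denote distinct locations, so no dependence is
   recorded.  If DR_INIT is equal the accesses overlap, and only a load
   that comes before the store is acceptable, because basic-block
   vectorization keeps the vector load before the scalar loads and the
   vector store after the scalar stores.  */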
572 /* Function vect_slp_analyze_data_ref_dependences.
574 Examine all the data references in the basic-block, and make sure there
575 do not exist any data dependences between them that would prohibit
576 vectorization. */
578 bool
579 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
581 struct data_dependence_relation *ddr;
582 unsigned int i;
584 if (dump_enabled_p ())
585 dump_printf_loc (MSG_NOTE, vect_location,
586 "=== vect_slp_analyze_data_ref_dependences ===");
588 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
589 &BB_VINFO_DDRS (bb_vinfo),
590 vNULL, true))
591 return false;
593 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
594 if (vect_slp_analyze_data_ref_dependence (ddr))
595 return false;
597 return true;
601 /* Function vect_compute_data_ref_alignment
603 Compute the misalignment of the data reference DR.
605 Output:
606 1. If the misalignment computation finds that the data reference cannot be
607 vectorized, then FALSE is returned.
608 2. DR_MISALIGNMENT (DR) is defined.
610 FOR NOW: No analysis is actually performed. Misalignment is calculated
611 only for trivial cases. TODO. */
613 static bool
614 vect_compute_data_ref_alignment (struct data_reference *dr)
616 gimple stmt = DR_STMT (dr);
617 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
618 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
619 struct loop *loop = NULL;
620 tree ref = DR_REF (dr);
621 tree vectype;
622 tree base, base_addr;
623 bool base_aligned;
624 tree misalign;
625 tree aligned_to, alignment;
627 if (dump_enabled_p ())
628 dump_printf_loc (MSG_NOTE, vect_location,
629 "vect_compute_data_ref_alignment:");
631 if (loop_vinfo)
632 loop = LOOP_VINFO_LOOP (loop_vinfo);
634 /* Initialize misalignment to unknown. */
635 SET_DR_MISALIGNMENT (dr, -1);
637 /* Strided loads perform only component accesses, misalignment information
638 is irrelevant for them. */
639 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
640 return true;
642 misalign = DR_INIT (dr);
643 aligned_to = DR_ALIGNED_TO (dr);
644 base_addr = DR_BASE_ADDRESS (dr);
645 vectype = STMT_VINFO_VECTYPE (stmt_info);
647 /* In case the dataref is in an inner-loop of the loop that is being
648 vectorized (LOOP), we use the base and misalignment information
649 relative to the outer-loop (LOOP). This is ok only if the misalignment
650 stays the same throughout the execution of the inner-loop, which is why
651 we have to check that the stride of the dataref in the inner-loop is a
652 multiple of the vector size. */
653 if (loop && nested_in_vect_loop_p (loop, stmt))
655 tree step = DR_STEP (dr);
656 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
658 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
660 if (dump_enabled_p ())
661 dump_printf_loc (MSG_NOTE, vect_location,
662 "inner step divides the vector-size.");
663 misalign = STMT_VINFO_DR_INIT (stmt_info);
664 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
665 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
667 else
669 if (dump_enabled_p ())
670 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
671 "inner step doesn't divide the vector-size.");
672 misalign = NULL_TREE;
676 /* Similarly, if we're doing basic-block vectorization, we can only use
677 base and misalignment information relative to an innermost loop if the
678 misalignment stays the same throughout the execution of the loop.
679 As above, this is the case if the stride of the dataref is a multiple
680 of the vector size. */
681 if (!loop)
683 tree step = DR_STEP (dr);
684 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
686 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
688 if (dump_enabled_p ())
689 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
690 "SLP: step doesn't divide the vector-size.");
691 misalign = NULL_TREE;
695 base = build_fold_indirect_ref (base_addr);
696 alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT);
698 if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
699 || !misalign)
701 if (dump_enabled_p ())
703 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
704 "Unknown alignment for access: ");
705 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
707 return true;
710 if ((DECL_P (base)
711 && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
712 alignment) >= 0)
713 || (TREE_CODE (base_addr) == SSA_NAME
714 && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
715 TREE_TYPE (base_addr)))),
716 alignment) >= 0)
717 || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
718 base_aligned = true;
719 else
720 base_aligned = false;
722 if (!base_aligned)
724 /* Do not change the alignment of global variables here if
725 flag_section_anchors is enabled as we already generated
726 RTL for other functions. Most global variables should
727 have been aligned during the IPA increase_alignment pass. */
728 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
729 || (TREE_STATIC (base) && flag_section_anchors))
731 if (dump_enabled_p ())
733 dump_printf_loc (MSG_NOTE, vect_location,
734 "can't force alignment of ref: ");
735 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
737 return true;
740 /* Force the alignment of the decl.
741 NOTE: This is the only change to the code we make during
742 the analysis phase, before deciding to vectorize the loop. */
743 if (dump_enabled_p ())
745 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
746 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
749 DECL_ALIGN (base) = TYPE_ALIGN (vectype);
750 DECL_USER_ALIGN (base) = 1;
753 /* At this point we assume that the base is aligned. */
754 gcc_assert (base_aligned
755 || (TREE_CODE (base) == VAR_DECL
756 && DECL_ALIGN (base) >= TYPE_ALIGN (vectype)));
758 /* If this is a backward running DR then first access in the larger
759 vectype actually is N-1 elements before the address in the DR.
760 Adjust misalign accordingly. */
761 if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
763 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
764 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
765 otherwise we wouldn't be here. */
766 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
767 /* PLUS because DR_STEP was negative. */
768 misalign = size_binop (PLUS_EXPR, misalign, offset);
771 /* Modulo alignment. */
772 misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);
774 if (!host_integerp (misalign, 1))
776 /* Negative or overflowed misalignment value. */
777 if (dump_enabled_p ())
778 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
779 "unexpected misalign value");
780 return false;
783 SET_DR_MISALIGNMENT (dr, TREE_INT_CST_LOW (misalign));
785 if (dump_enabled_p ())
787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
788 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
789 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
792 return true;
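/* Worked example, assuming vectype V4SI (16-byte alignment) and a base
   address known to be 16-byte aligned: DR_INIT == 4 gives a misalignment
   of 4 bytes.  For a backward access with DR_STEP == -4 the first element
   of the vector lies 3 * 4 == 12 bytes below the DR address, so the
   misalignment becomes (4 - 12) floor-mod 16 == 8.  */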
796 /* Function vect_compute_data_refs_alignment
798 Compute the misalignment of data references in the loop.
799 Return FALSE if a data reference is found that cannot be vectorized. */
801 static bool
802 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
803 bb_vec_info bb_vinfo)
805 vec<data_reference_p> datarefs;
806 struct data_reference *dr;
807 unsigned int i;
809 if (loop_vinfo)
810 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
811 else
812 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
814 FOR_EACH_VEC_ELT (datarefs, i, dr)
815 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
816 && !vect_compute_data_ref_alignment (dr))
818 if (bb_vinfo)
820 /* Mark unsupported statement as unvectorizable. */
821 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
822 continue;
824 else
825 return false;
828 return true;
832 /* Function vect_update_misalignment_for_peel
834 DR - the data reference whose misalignment is to be adjusted.
835 DR_PEEL - the data reference whose misalignment is being made
836 zero in the vector loop by the peel.
837 NPEEL - the number of iterations in the peel loop if the misalignment
838 of DR_PEEL is known at compile time. */
840 static void
841 vect_update_misalignment_for_peel (struct data_reference *dr,
842 struct data_reference *dr_peel, int npeel)
844 unsigned int i;
845 vec<dr_p> same_align_drs;
846 struct data_reference *current_dr;
847 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
848 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
849 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
850 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
852 /* For interleaved data accesses the step in the loop must be multiplied by
853 the size of the interleaving group. */
854 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
855 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
856 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
857 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
859 /* It can be assumed that the data refs with the same alignment as dr_peel
860 are aligned in the vector loop. */
861 same_align_drs
862 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
863 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
865 if (current_dr != dr)
866 continue;
867 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
868 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
869 SET_DR_MISALIGNMENT (dr, 0);
870 return;
873 if (known_alignment_for_access_p (dr)
874 && known_alignment_for_access_p (dr_peel))
876 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
877 int misal = DR_MISALIGNMENT (dr);
878 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
879 misal += negative ? -npeel * dr_size : npeel * dr_size;
880 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
881 SET_DR_MISALIGNMENT (dr, misal);
882 return;
885 if (dump_enabled_p ())
886 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.");
887 SET_DR_MISALIGNMENT (dr, -1);
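/* Worked example for the known-alignment case above: with 4-byte elements,
   NPEEL == 3 and a 16-byte vector alignment, a data-ref that was misaligned
   by 4 bytes ends up with misalignment (4 + 3 * 4) & 15 == 0 for a forward
   step, and (4 - 3 * 4) & 15 == 8 for a backward step.  */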
891 /* Function vect_verify_datarefs_alignment
893 Return TRUE if all data references in the loop can be
894 handled with respect to alignment. */
896 bool
897 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
899 vec<data_reference_p> datarefs;
900 struct data_reference *dr;
901 enum dr_alignment_support supportable_dr_alignment;
902 unsigned int i;
904 if (loop_vinfo)
905 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
906 else
907 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
909 FOR_EACH_VEC_ELT (datarefs, i, dr)
911 gimple stmt = DR_STMT (dr);
912 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
914 if (!STMT_VINFO_RELEVANT_P (stmt_info))
915 continue;
917 /* For interleaving, only the alignment of the first access matters.
918 Skip statements marked as not vectorizable. */
919 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
920 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
921 || !STMT_VINFO_VECTORIZABLE (stmt_info))
922 continue;
924 /* Strided loads perform only component accesses, alignment is
925 irrelevant for them. */
926 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
927 continue;
929 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
930 if (!supportable_dr_alignment)
932 if (dump_enabled_p ())
934 if (DR_IS_READ (dr))
935 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
936 "not vectorized: unsupported unaligned load.");
937 else
938 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
939 "not vectorized: unsupported unaligned "
940 "store.");
942 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
943 DR_REF (dr));
945 return false;
947 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
948 dump_printf_loc (MSG_NOTE, vect_location,
949 "Vectorizing an unaligned access.");
951 return true;
954 /* Given a memory reference EXP, return whether its alignment is less
955 than its size. */
957 static bool
958 not_size_aligned (tree exp)
960 if (!host_integerp (TYPE_SIZE (TREE_TYPE (exp)), 1))
961 return true;
963 return (TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (exp)))
964 > get_object_alignment (exp));
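/* For example, a 4-byte int object whose known alignment is only 2 bytes
   yields TYPE_SIZE == 32 bits > get_object_alignment == 16 bits, so the
   reference counts as not size-aligned ("packed").  */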
967 /* Function vector_alignment_reachable_p
969 Return true if vector alignment for DR is reachable by peeling
970 a few loop iterations. Return false otherwise. */
972 static bool
973 vector_alignment_reachable_p (struct data_reference *dr)
975 gimple stmt = DR_STMT (dr);
976 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
977 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
979 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
981 /* For interleaved accesses we peel only if the number of iterations in
982 the prolog loop (VF - misalignment, in elements) is a multiple of the
983 number of interleaved accesses (the group size). */
984 int elem_size, mis_in_elements;
985 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
987 /* FORNOW: handle only known alignment. */
988 if (!known_alignment_for_access_p (dr))
989 return false;
991 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
992 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
994 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
995 return false;
998 /* If misalignment is known at the compile time then allow peeling
999 only if natural alignment is reachable through peeling. */
1000 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1002 HOST_WIDE_INT elmsize =
1003 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1004 if (dump_enabled_p ())
1006 dump_printf_loc (MSG_NOTE, vect_location,
1007 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1008 dump_printf (MSG_NOTE,
1009 ". misalignment = %d. ", DR_MISALIGNMENT (dr));
1011 if (DR_MISALIGNMENT (dr) % elmsize)
1013 if (dump_enabled_p ())
1014 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1015 "data size does not divide the misalignment.\n");
1016 return false;
1020 if (!known_alignment_for_access_p (dr))
1022 tree type = TREE_TYPE (DR_REF (dr));
1023 bool is_packed = not_size_aligned (DR_REF (dr));
1024 if (dump_enabled_p ())
1025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1026 "Unknown misalignment, is_packed = %d",is_packed);
1027 if (targetm.vectorize.vector_alignment_reachable (type, is_packed))
1028 return true;
1029 else
1030 return false;
1033 return true;
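/* Illustration of the interleaved case above, assuming V4SI with 4-byte
   elements and GROUP_SIZE == 2: a misalignment of 8 bytes is 2 elements
   and (4 - 2) % 2 == 0, so the group check passes; a misalignment of
   4 bytes (1 element) gives (4 - 1) % 2 != 0 and the function returns
   false.  */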
1037 /* Calculate the cost of the memory access represented by DR. */
1039 static void
1040 vect_get_data_access_cost (struct data_reference *dr,
1041 unsigned int *inside_cost,
1042 unsigned int *outside_cost,
1043 stmt_vector_for_cost *body_cost_vec)
1045 gimple stmt = DR_STMT (dr);
1046 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1047 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1048 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1049 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1050 int ncopies = vf / nunits;
1052 if (DR_IS_READ (dr))
1053 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1054 NULL, body_cost_vec, false);
1055 else
1056 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1058 if (dump_enabled_p ())
1059 dump_printf_loc (MSG_NOTE, vect_location,
1060 "vect_get_data_access_cost: inside_cost = %d, "
1061 "outside_cost = %d.", *inside_cost, *outside_cost);
1065 /* Insert DR into peeling hash table with NPEEL as key. */
1067 static void
1068 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1069 int npeel)
1071 struct _vect_peel_info elem, *slot;
1072 _vect_peel_info **new_slot;
1073 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1075 elem.npeel = npeel;
1076 slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find (&elem);
1077 if (slot)
1078 slot->count++;
1079 else
1081 slot = XNEW (struct _vect_peel_info);
1082 slot->npeel = npeel;
1083 slot->dr = dr;
1084 slot->count = 1;
1085 new_slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find_slot (slot, INSERT);
1086 *new_slot = slot;
1089 if (!supportable_dr_alignment && !flag_vect_cost_model)
1090 slot->count += VECT_MAX_COST;
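/* Each hash table entry, keyed by NPEEL, counts how many data accesses
   would become aligned by peeling NPEEL iterations.  When an access is not
   supportable unaligned and the cost model is disabled, the count is
   boosted by VECT_MAX_COST so that the most-frequent heuristic strongly
   prefers peeling amounts that align such accesses.  */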
1094 /* Traverse the peeling hash table to find the peeling option that aligns
1095 the maximum number of data accesses. */
1097 static int
1098 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1099 _vect_peel_extended_info *max)
1101 vect_peel_info elem = *slot;
1103 if (elem->count > max->peel_info.count
1104 || (elem->count == max->peel_info.count
1105 && max->peel_info.npeel > elem->npeel))
1107 max->peel_info.npeel = elem->npeel;
1108 max->peel_info.count = elem->count;
1109 max->peel_info.dr = elem->dr;
1112 return 1;
1116 /* Traverse the peeling hash table and calculate the cost for each peeling
1117 option. Find the one with the lowest cost. */
1119 static int
1120 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1121 _vect_peel_extended_info *min)
1123 vect_peel_info elem = *slot;
1124 int save_misalignment, dummy;
1125 unsigned int inside_cost = 0, outside_cost = 0, i;
1126 gimple stmt = DR_STMT (elem->dr);
1127 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1128 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1129 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1130 struct data_reference *dr;
1131 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1132 int single_iter_cost;
1134 prologue_cost_vec.create (2);
1135 body_cost_vec.create (2);
1136 epilogue_cost_vec.create (2);
1138 FOR_EACH_VEC_ELT (datarefs, i, dr)
1140 stmt = DR_STMT (dr);
1141 stmt_info = vinfo_for_stmt (stmt);
1142 /* For interleaving, only the alignment of the first access
1143 matters. */
1144 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1145 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1146 continue;
1148 save_misalignment = DR_MISALIGNMENT (dr);
1149 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1150 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1151 &body_cost_vec);
1152 SET_DR_MISALIGNMENT (dr, save_misalignment);
1155 single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
1156 outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel,
1157 &dummy, single_iter_cost,
1158 &prologue_cost_vec,
1159 &epilogue_cost_vec);
1161 /* Prologue and epilogue costs are added to the target model later.
1162 These costs depend only on the scalar iteration cost, the
1163 number of peeling iterations finally chosen, and the number of
1164 misaligned statements. So discard the information found here. */
1165 prologue_cost_vec.release ();
1166 epilogue_cost_vec.release ();
1168 if (inside_cost < min->inside_cost
1169 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1171 min->inside_cost = inside_cost;
1172 min->outside_cost = outside_cost;
1173 min->body_cost_vec.release ();
1174 min->body_cost_vec = body_cost_vec;
1175 min->peel_info.dr = elem->dr;
1176 min->peel_info.npeel = elem->npeel;
1178 else
1179 body_cost_vec.release ();
1181 return 1;
1185 /* Choose best peeling option by traversing peeling hash table and either
1186 choosing an option with the lowest cost (if cost model is enabled) or the
1187 option that aligns as many accesses as possible. */
1189 static struct data_reference *
1190 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1191 unsigned int *npeel,
1192 stmt_vector_for_cost *body_cost_vec)
1194 struct _vect_peel_extended_info res;
1196 res.peel_info.dr = NULL;
1197 res.body_cost_vec = stmt_vector_for_cost();
1199 if (flag_vect_cost_model)
1201 res.inside_cost = INT_MAX;
1202 res.outside_cost = INT_MAX;
1203 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1204 .traverse <_vect_peel_extended_info *,
1205 vect_peeling_hash_get_lowest_cost> (&res);
1207 else
1209 res.peel_info.count = 0;
1210 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1211 .traverse <_vect_peel_extended_info *,
1212 vect_peeling_hash_get_most_frequent> (&res);
1215 *npeel = res.peel_info.npeel;
1216 *body_cost_vec = res.body_cost_vec;
1217 return res.peel_info.dr;
1221 /* Function vect_enhance_data_refs_alignment
1223 This pass will use loop versioning and loop peeling in order to enhance
1224 the alignment of data references in the loop.
1226 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1227 original loop is to be vectorized. Any other loops that are created by
1228 the transformations performed in this pass are not supposed to be
1229 vectorized. This restriction will be relaxed.
1231 This pass will require a cost model to guide it whether to apply peeling
1232 or versioning or a combination of the two. For example, the scheme that
1233 Intel uses when given a loop with several memory accesses is as follows:
1234 choose one memory access ('p') whose alignment you want to force by doing
1235 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1236 other accesses are not necessarily aligned, or (2) use loop versioning to
1237 generate one loop in which all accesses are aligned, and another loop in
1238 which only 'p' is necessarily aligned.
1240 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1241 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1242 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1244 Devising a cost model is the most critical aspect of this work. It will
1245 guide us on which access to peel for, whether to use loop versioning, how
1246 many versions to create, etc. The cost model will probably consist of
1247 generic considerations as well as target specific considerations (on
1248 powerpc for example, misaligned stores are more painful than misaligned
1249 loads).
1251 Here are the general steps involved in alignment enhancements:
1253 -- original loop, before alignment analysis:
1254 for (i=0; i<N; i++){
1255 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1256 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1259 -- After vect_compute_data_refs_alignment:
1260 for (i=0; i<N; i++){
1261 x = q[i]; # DR_MISALIGNMENT(q) = 3
1262 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1265 -- Possibility 1: we do loop versioning:
1266 if (p is aligned) {
1267 for (i=0; i<N; i++){ # loop 1A
1268 x = q[i]; # DR_MISALIGNMENT(q) = 3
1269 p[i] = y; # DR_MISALIGNMENT(p) = 0
1272 else {
1273 for (i=0; i<N; i++){ # loop 1B
1274 x = q[i]; # DR_MISALIGNMENT(q) = 3
1275 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1279 -- Possibility 2: we do loop peeling:
1280 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1281 x = q[i];
1282 p[i] = y;
1284 for (i = 3; i < N; i++){ # loop 2A
1285 x = q[i]; # DR_MISALIGNMENT(q) = 0
1286 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1289 -- Possibility 3: combination of loop peeling and versioning:
1290 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1291 x = q[i];
1292 p[i] = y;
1294 if (p is aligned) {
1295 for (i = 3; i<N; i++){ # loop 3A
1296 x = q[i]; # DR_MISALIGNMENT(q) = 0
1297 p[i] = y; # DR_MISALIGNMENT(p) = 0
1300 else {
1301 for (i = 3; i<N; i++){ # loop 3B
1302 x = q[i]; # DR_MISALIGNMENT(q) = 0
1303 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1307 These loops are later passed to loop_transform to be vectorized. The
1308 vectorizer will use the alignment information to guide the transformation
1309 (whether to generate regular loads/stores, or with special handling for
1310 misalignment). */
1312 bool
1313 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1315 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1316 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1317 enum dr_alignment_support supportable_dr_alignment;
1318 struct data_reference *dr0 = NULL, *first_store = NULL;
1319 struct data_reference *dr;
1320 unsigned int i, j;
1321 bool do_peeling = false;
1322 bool do_versioning = false;
1323 bool stat;
1324 gimple stmt;
1325 stmt_vec_info stmt_info;
1326 int vect_versioning_for_alias_required;
1327 unsigned int npeel = 0;
1328 bool all_misalignments_unknown = true;
1329 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1330 unsigned possible_npeel_number = 1;
1331 tree vectype;
1332 unsigned int nelements, mis, same_align_drs_max = 0;
1333 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost();
1335 if (dump_enabled_p ())
1336 dump_printf_loc (MSG_NOTE, vect_location,
1337 "=== vect_enhance_data_refs_alignment ===");
1339 /* While cost model enhancements are expected in the future, the high level
1340 view of the code at this time is as follows:
1342 A) If there is a misaligned access then see if peeling to align
1343 this access can make all data references satisfy
1344 vect_supportable_dr_alignment. If so, update data structures
1345 as needed and return true.
1347 B) If peeling wasn't possible and there is a data reference with an
1348 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1349 then see if loop versioning checks can be used to make all data
1350 references satisfy vect_supportable_dr_alignment. If so, update
1351 data structures as needed and return true.
1353 C) If neither peeling nor versioning were successful then return false if
1354 any data reference does not satisfy vect_supportable_dr_alignment.
1356 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1358 Note, Possibility 3 above (which is peeling and versioning together) is not
1359 being done at this time. */
1361 /* (1) Peeling to force alignment. */
1363 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1364 Considerations:
1365 + How many accesses will become aligned due to the peeling
1366 - How many accesses will become unaligned due to the peeling,
1367 and the cost of misaligned accesses.
1368 - The cost of peeling (the extra runtime checks, the increase
1369 in code size). */
1371 FOR_EACH_VEC_ELT (datarefs, i, dr)
1373 stmt = DR_STMT (dr);
1374 stmt_info = vinfo_for_stmt (stmt);
1376 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1377 continue;
1379 /* For interleaving, only the alignment of the first access
1380 matters. */
1381 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1382 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1383 continue;
1385 /* For invariant accesses there is nothing to enhance. */
1386 if (integer_zerop (DR_STEP (dr)))
1387 continue;
1389 /* Strided loads perform only component accesses, alignment is
1390 irrelevant for them. */
1391 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1392 continue;
1394 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1395 do_peeling = vector_alignment_reachable_p (dr);
1396 if (do_peeling)
1398 if (known_alignment_for_access_p (dr))
1400 unsigned int npeel_tmp;
1401 bool negative = tree_int_cst_compare (DR_STEP (dr),
1402 size_zero_node) < 0;
1404 /* Save info about DR in the hash table. */
1405 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
1406 LOOP_VINFO_PEELING_HTAB (loop_vinfo).create (1);
1408 vectype = STMT_VINFO_VECTYPE (stmt_info);
1409 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1410 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1411 TREE_TYPE (DR_REF (dr))));
1412 npeel_tmp = (negative
1413 ? (mis - nelements) : (nelements - mis))
1414 & (nelements - 1);
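/* For example, with V4SI (4 elements of 4 bytes) and a misalignment of
   4 bytes, mis == 1 element and npeel_tmp is (4 - 1) & 3 == 3 for a
   forward access, or (1 - 4) & 3 == 1 for a backward (negative step)
   access.  */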
1416 /* For multiple types, it is possible that the bigger type access
1417 will have more than one peeling option. E.g., a loop with two
1418 types: one of size (vector size / 4), and the other one of
1419 size (vector size / 8). The vectorization factor will be 8. If both
1420 accesses are misaligned by 3, the first one needs one scalar
1421 iteration to be aligned, and the second one needs 5. But the
1422 first one will also be aligned by peeling 5 scalar
1423 iterations, and in that case both accesses will be aligned.
1424 Hence, in addition to the immediate peeling amount, we also want
1425 to try adding a full vector size, as long as we don't exceed the
1426 vectorization factor.
1427 We do this automatically for the cost model, since we calculate
1428 the cost for every peeling option. */
1429 if (!flag_vect_cost_model)
1430 possible_npeel_number = vf /nelements;
1432 /* Handle the aligned case. We may decide to align some other
1433 access, making DR unaligned. */
1434 if (DR_MISALIGNMENT (dr) == 0)
1436 npeel_tmp = 0;
1437 if (!flag_vect_cost_model)
1438 possible_npeel_number++;
1441 for (j = 0; j < possible_npeel_number; j++)
1443 gcc_assert (npeel_tmp <= vf);
1444 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1445 npeel_tmp += nelements;
1448 all_misalignments_unknown = false;
1449 /* Data-ref that was chosen for the case that all the
1450 misalignments are unknown is not relevant anymore, since we
1451 have a data-ref with known alignment. */
1452 dr0 = NULL;
1454 else
1456 /* If we don't know any misalignment values, we prefer
1457 peeling for the data-ref that has the maximum number of data-refs
1458 with the same alignment, unless the target prefers to align
1459 stores over loads. */
1460 if (all_misalignments_unknown)
1462 unsigned same_align_drs
1463 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1464 if (!dr0
1465 || same_align_drs_max < same_align_drs)
1467 same_align_drs_max = same_align_drs;
1468 dr0 = dr;
1470 /* For data-refs with the same number of related
1471 accesses prefer the one where the misalign
1472 computation will be invariant in the outermost loop. */
1473 else if (same_align_drs_max == same_align_drs)
1475 struct loop *ivloop0, *ivloop;
1476 ivloop0 = outermost_invariant_loop_for_expr
1477 (loop, DR_BASE_ADDRESS (dr0));
1478 ivloop = outermost_invariant_loop_for_expr
1479 (loop, DR_BASE_ADDRESS (dr));
1480 if ((ivloop && !ivloop0)
1481 || (ivloop && ivloop0
1482 && flow_loop_nested_p (ivloop, ivloop0)))
1483 dr0 = dr;
1486 if (!first_store && DR_IS_WRITE (dr))
1487 first_store = dr;
1490 /* If there are both known and unknown misaligned accesses in the
1491 loop, we choose peeling amount according to the known
1492 accesses. */
1493 if (!supportable_dr_alignment)
1495 dr0 = dr;
1496 if (!first_store && DR_IS_WRITE (dr))
1497 first_store = dr;
1501 else
1503 if (!aligned_access_p (dr))
1505 if (dump_enabled_p ())
1506 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1507 "vector alignment may not be reachable");
1508 break;
1513 vect_versioning_for_alias_required
1514 = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
1516 /* Temporarily, if versioning for alias is required, we disable peeling
1517 until we support peeling and versioning. Often peeling for alignment
1518 will require peeling for loop-bound, which in turn requires that we
1519 know how to adjust the loop ivs after the loop. */
1520 if (vect_versioning_for_alias_required
1521 || !vect_can_advance_ivs_p (loop_vinfo)
1522 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1523 do_peeling = false;
1525 if (do_peeling && all_misalignments_unknown
1526 && vect_supportable_dr_alignment (dr0, false))
1529 /* Check if the target prefers stores over loads, i.e., if
1530 misaligned stores are more expensive than misaligned loads (taking
1531 drs with the same alignment into account). */
1532 if (first_store && DR_IS_READ (dr0))
1534 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1535 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1536 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1537 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1538 stmt_vector_for_cost dummy;
1539 dummy.create (2);
1541 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1542 &dummy);
1543 vect_get_data_access_cost (first_store, &store_inside_cost,
1544 &store_outside_cost, &dummy);
1546 dummy.release ();
1548 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1549 aligning the load DR0). */
1550 load_inside_penalty = store_inside_cost;
1551 load_outside_penalty = store_outside_cost;
1552 for (i = 0;
1553 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1554 DR_STMT (first_store))).iterate (i, &dr);
1555 i++)
1556 if (DR_IS_READ (dr))
1558 load_inside_penalty += load_inside_cost;
1559 load_outside_penalty += load_outside_cost;
1561 else
1563 load_inside_penalty += store_inside_cost;
1564 load_outside_penalty += store_outside_cost;
1567 /* Calculate the penalty for leaving DR0 unaligned (by
1568 aligning the FIRST_STORE). */
1569 store_inside_penalty = load_inside_cost;
1570 store_outside_penalty = load_outside_cost;
1571 for (i = 0;
1572 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1573 DR_STMT (dr0))).iterate (i, &dr);
1574 i++)
1575 if (DR_IS_READ (dr))
1577 store_inside_penalty += load_inside_cost;
1578 store_outside_penalty += load_outside_cost;
1580 else
1582 store_inside_penalty += store_inside_cost;
1583 store_outside_penalty += store_outside_cost;
1586 if (load_inside_penalty > store_inside_penalty
1587 || (load_inside_penalty == store_inside_penalty
1588 && load_outside_penalty > store_outside_penalty))
1589 dr0 = first_store;
1592 /* In case there are only loads with different unknown misalignments, use
1593 peeling only if it may help to align other accesses in the loop. */
1594 if (!first_store
1595 && !STMT_VINFO_SAME_ALIGN_REFS (
1596 vinfo_for_stmt (DR_STMT (dr0))).length ()
1597 && vect_supportable_dr_alignment (dr0, false)
1598 != dr_unaligned_supported)
1599 do_peeling = false;
1602 if (do_peeling && !dr0)
1604 /* Peeling is possible, but there is no data access that is unsupported
1605 when unaligned. So we try to choose the best possible peeling. */
1607 /* We should get here only if there are drs with known misalignment. */
1608 gcc_assert (!all_misalignments_unknown);
1610 /* Choose the best peeling from the hash table. */
1611 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1612 &body_cost_vec);
1613 if (!dr0 || !npeel)
1614 do_peeling = false;
1617 if (do_peeling)
1619 stmt = DR_STMT (dr0);
1620 stmt_info = vinfo_for_stmt (stmt);
1621 vectype = STMT_VINFO_VECTYPE (stmt_info);
1622 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1624 if (known_alignment_for_access_p (dr0))
1626 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1627 size_zero_node) < 0;
1628 if (!npeel)
1630 /* Since it's known at compile time, compute the number of
1631 iterations in the peeled loop (the peeling factor) for use in
1632 updating DR_MISALIGNMENT values. The peeling factor is the
1633 vectorization factor minus the misalignment as an element
1634 count. */
1635 mis = DR_MISALIGNMENT (dr0);
1636 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1637 npeel = ((negative ? mis - nelements : nelements - mis)
1638 & (nelements - 1));
1641 /* For interleaved data access every iteration accesses all the
1642 members of the group, therefore we divide the number of iterations
1643 by the group size. */
1644 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1645 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1646 npeel /= GROUP_SIZE (stmt_info);
1648 if (dump_enabled_p ())
1649 dump_printf_loc (MSG_NOTE, vect_location,
1650 "Try peeling by %d", npeel);
1653 /* Ensure that all data refs can be vectorized after the peel. */
1654 FOR_EACH_VEC_ELT (datarefs, i, dr)
1656 int save_misalignment;
1658 if (dr == dr0)
1659 continue;
1661 stmt = DR_STMT (dr);
1662 stmt_info = vinfo_for_stmt (stmt);
1663 /* For interleaving, only the alignment of the first access
1664 matters. */
1665 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1666 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1667 continue;
1669 /* Strided loads perform only component accesses, alignment is
1670 irrelevant for them. */
1671 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1672 continue;
1674 save_misalignment = DR_MISALIGNMENT (dr);
1675 vect_update_misalignment_for_peel (dr, dr0, npeel);
1676 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1677 SET_DR_MISALIGNMENT (dr, save_misalignment);
1679 if (!supportable_dr_alignment)
1681 do_peeling = false;
1682 break;
1686 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1688 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1689 if (!stat)
1690 do_peeling = false;
1691 else
1693 body_cost_vec.release ();
1694 return stat;
1698 if (do_peeling)
1700 stmt_info_for_cost *si;
1701 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
1703 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1704 If the misalignment of DR_i is identical to that of dr0 then set
1705 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1706 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1707 by the peeling factor times the element size of DR_i (MOD the
1708 vectorization factor times the size). Otherwise, the
1709 misalignment of DR_i must be set to unknown. */
1710 FOR_EACH_VEC_ELT (datarefs, i, dr)
1711 if (dr != dr0)
1712 vect_update_misalignment_for_peel (dr, dr0, npeel);
1714 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1715 if (npeel)
1716 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1717 else
1718 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = DR_MISALIGNMENT (dr0);
1719 SET_DR_MISALIGNMENT (dr0, 0);
1720 if (dump_enabled_p ())
1722 dump_printf_loc (MSG_NOTE, vect_location,
1723 "Alignment of access forced using peeling.");
1724 dump_printf_loc (MSG_NOTE, vect_location,
1725 "Peeling for alignment will be applied.");
1727 /* We've delayed passing the inside-loop peeling costs to the
1728 target cost model until we were sure peeling would happen.
1729 Do so now. */
1730 if (body_cost_vec.exists ())
1732 FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1734 struct _stmt_vec_info *stmt_info
1735 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1736 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1737 si->misalign, vect_body);
1739 body_cost_vec.release ();
1742 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1743 gcc_assert (stat);
1744 return stat;
1748 body_cost_vec.release ();
1750 /* (2) Versioning to force alignment. */
1752 /* Try versioning if:
1753 1) flag_tree_vect_loop_version is TRUE
1754 2) optimize loop for speed
1755 3) there is at least one unsupported misaligned data ref with an unknown
1756 misalignment, and
1757 4) all misaligned data refs with a known misalignment are supported, and
1758 5) the number of runtime alignment checks is within reason. */
1760 do_versioning =
1761 flag_tree_vect_loop_version
1762 && optimize_loop_nest_for_speed_p (loop)
1763 && (!loop->inner); /* FORNOW */
1765 if (do_versioning)
1767 FOR_EACH_VEC_ELT (datarefs, i, dr)
1769 stmt = DR_STMT (dr);
1770 stmt_info = vinfo_for_stmt (stmt);
1772 /* For interleaving, only the alignment of the first access
1773 matters. */
1774 if (aligned_access_p (dr)
1775 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1776 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1777 continue;
1779 /* Strided loads perform only component accesses, alignment is
1780 irrelevant for them. */
1781 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1782 continue;
1784 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1786 if (!supportable_dr_alignment)
1788 gimple stmt;
1789 int mask;
1790 tree vectype;
1792 if (known_alignment_for_access_p (dr)
1793 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1794 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1796 do_versioning = false;
1797 break;
1800 stmt = DR_STMT (dr);
1801 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1802 gcc_assert (vectype);
1804 /* The rightmost bits of an aligned address must be zeros.
1805 Construct the mask needed for this test. For example,
1806 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1807 mask must be 15 = 0xf. */
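/* The guard emitted later when the loop is versioned then effectively
   tests ((addr_1 | addr_2 | ... | addr_n) & mask) == 0, i.e. that every
   recorded address is a multiple of the vector size.  */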
1808 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1810 /* FORNOW: use the same mask to test all potentially unaligned
1811 references in the loop. The vectorizer currently supports
1812 a single vector size, see the reference to
1813 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1814 vectorization factor is computed. */
1815 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1816 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1817 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1818 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1819 DR_STMT (dr));
1823 /* Versioning requires at least one misaligned data reference. */
1824 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1825 do_versioning = false;
1826 else if (!do_versioning)
1827 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1830 if (do_versioning)
1832 vec<gimple> may_misalign_stmts
1833 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1834 gimple stmt;
1836 /* It can now be assumed that the data references in the statements
1837 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1838 of the loop being vectorized. */
1839 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1841 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1842 dr = STMT_VINFO_DATA_REF (stmt_info);
1843 SET_DR_MISALIGNMENT (dr, 0);
1844 if (dump_enabled_p ())
1845 dump_printf_loc (MSG_NOTE, vect_location,
1846 "Alignment of access forced using versioning.");
1849 if (dump_enabled_p ())
1850 dump_printf_loc (MSG_NOTE, vect_location,
1851 "Versioning for alignment will be applied.");
1853 /* Peeling and versioning can't be done together at this time. */
1854 gcc_assert (! (do_peeling && do_versioning));
1856 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1857 gcc_assert (stat);
1858 return stat;
1861 /* This point is reached if neither peeling nor versioning is being done. */
1862 gcc_assert (! (do_peeling || do_versioning));
1864 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1865 return stat;
1869 /* Function vect_find_same_alignment_drs.
1871 Update group and alignment relations according to the chosen
1872 vectorization factor. */
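/* For example, with a vectorization factor of 4, a store to a[i] and a
   load from a[i + 4] are a dependence distance of 4 elements apart;
   since 4 % VF == 0 and the element sizes match, the two references are
   recorded as having the same alignment.  */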
1874 static void
1875 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1876 loop_vec_info loop_vinfo)
1878 unsigned int i;
1879 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1880 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1881 struct data_reference *dra = DDR_A (ddr);
1882 struct data_reference *drb = DDR_B (ddr);
1883 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1884 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1885 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1886 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1887 lambda_vector dist_v;
1888 unsigned int loop_depth;
1890 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1891 return;
1893 if (dra == drb)
1894 return;
1896 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1897 return;
1899 /* Loop-based vectorization and known data dependence. */
1900 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1901 return;
1903 /* Data-dependence analysis reports a distance vector of zero
1904 for data-references that overlap only in the first iteration
1905 but have different sign step (see PR45764).
1906 So as a sanity check require equal DR_STEP. */
1907 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1908 return;
1910 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1911 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1913 int dist = dist_v[loop_depth];
1915 if (dump_enabled_p ())
1916 dump_printf_loc (MSG_NOTE, vect_location,
1917 "dependence distance = %d.", dist);
1919 /* Same loop iteration. */
1920 if (dist == 0
1921 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1923 /* Two references with distance zero have the same alignment. */
1924 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1925 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1926 if (dump_enabled_p ())
1928 dump_printf_loc (MSG_NOTE, vect_location,
1929 "accesses have the same alignment.");
1930 dump_printf (MSG_NOTE,
1931 "dependence distance modulo vf == 0 between ");
1932 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1933 dump_printf (MSG_NOTE, " and ");
1934 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1941 /* Function vect_analyze_data_refs_alignment
1943 Analyze the alignment of the data-references in the loop.
1944 Return FALSE if a data reference is found that cannot be vectorized. */
1946 bool
1947 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1948 bb_vec_info bb_vinfo)
1950 if (dump_enabled_p ())
1951 dump_printf_loc (MSG_NOTE, vect_location,
1952 "=== vect_analyze_data_refs_alignment ===");
1954 /* Mark groups of data references with same alignment using
1955 data dependence information. */
1956 if (loop_vinfo)
1958 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
1959 struct data_dependence_relation *ddr;
1960 unsigned int i;
1962 FOR_EACH_VEC_ELT (ddrs, i, ddr)
1963 vect_find_same_alignment_drs (ddr, loop_vinfo);
1966 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
1968 if (dump_enabled_p ())
1969 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1970 "not vectorized: can't calculate alignment "
1971 "for data ref.");
1972 return false;
1975 return true;
1979 /* Analyze groups of accesses: check that DR belongs to a group of
1980 accesses of legal size, step, etc. Detect gaps, single element
1981 interleaving, and other special cases. Set grouped access info.
1982 Collect groups of strided stores for further use in SLP analysis. */
1984 static bool
1985 vect_analyze_group_access (struct data_reference *dr)
1987 tree step = DR_STEP (dr);
1988 tree scalar_type = TREE_TYPE (DR_REF (dr));
1989 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
1990 gimple stmt = DR_STMT (dr);
1991 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1992 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1993 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
1994 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
1995 HOST_WIDE_INT groupsize, last_accessed_element = 1;
1996 bool slp_impossible = false;
1997 struct loop *loop = NULL;
1999 if (loop_vinfo)
2000 loop = LOOP_VINFO_LOOP (loop_vinfo);
2002 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2003 size of the interleaving group (including gaps). */
2004 groupsize = absu_hwi (dr_step) / type_size;
2006 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2007 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2009 /* Check if this DR is a part of interleaving, and is a single
2010 element of the group that is accessed in the loop. */
2012 /* Gaps are supported only for loads. STEP must be a multiple of the type
2013 size. The size of the group must be a power of 2. */
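/* For example, a load from a[2*i] with 4-byte elements has DR_STEP 8;
   groupsize is 8 / 4 = 2, a power of two, so the access is treated as a
   group of size 2 of which only the first element is actually read.  */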
2014 if (DR_IS_READ (dr)
2015 && (dr_step % type_size) == 0
2016 && groupsize > 0
2017 && exact_log2 (groupsize) != -1)
2019 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2020 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2021 if (dump_enabled_p ())
2023 dump_printf_loc (MSG_NOTE, vect_location,
2024 "Detected single element interleaving ");
2025 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2026 dump_printf (MSG_NOTE, " step ");
2027 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2030 if (loop_vinfo)
2032 if (dump_enabled_p ())
2033 dump_printf_loc (MSG_NOTE, vect_location,
2034 "Data access with gaps requires scalar "
2035 "epilogue loop");
2036 if (loop->inner)
2038 if (dump_enabled_p ())
2039 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2040 "Peeling for outer loop is not"
2041 " supported");
2042 return false;
2045 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2048 return true;
2051 if (dump_enabled_p ())
2053 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2054 "not consecutive access ");
2055 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2058 if (bb_vinfo)
2060 /* Mark the statement as unvectorizable. */
2061 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2062 return true;
2065 return false;
2068 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2070 /* First stmt in the interleaving chain. Check the chain. */
2071 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2072 struct data_reference *data_ref = dr;
2073 unsigned int count = 1;
2074 tree prev_init = DR_INIT (data_ref);
2075 gimple prev = stmt;
2076 HOST_WIDE_INT diff, gaps = 0;
2077 unsigned HOST_WIDE_INT count_in_bytes;
2079 while (next)
2081 /* Skip same data-refs. In case that two or more stmts share
2082 data-ref (supported only for loads), we vectorize only the first
2083 stmt, and the rest get their vectorized loads from the first
2084 one. */
2085 if (!tree_int_cst_compare (DR_INIT (data_ref),
2086 DR_INIT (STMT_VINFO_DATA_REF (
2087 vinfo_for_stmt (next)))))
2089 if (DR_IS_WRITE (data_ref))
2091 if (dump_enabled_p ())
2092 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2093 "Two store stmts share the same dr.");
2094 return false;
2097 /* For load use the same data-ref load. */
2098 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2100 prev = next;
2101 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2102 continue;
2105 prev = next;
2106 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2108 /* All group members have the same STEP by construction. */
2109 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2111 /* Check that the distance between two accesses is equal to the type
2112 size. Otherwise, we have gaps. */
2113 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2114 - TREE_INT_CST_LOW (prev_init)) / type_size;
2115 if (diff != 1)
2117 /* FORNOW: SLP of accesses with gaps is not supported. */
2118 slp_impossible = true;
2119 if (DR_IS_WRITE (data_ref))
2121 if (dump_enabled_p ())
2122 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2123 "interleaved store with gaps");
2124 return false;
2127 gaps += diff - 1;
2130 last_accessed_element += diff;
2132 /* Store the gap from the previous member of the group. If there is no
2133 gap in the access, GROUP_GAP is always 1. */
2134 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2136 prev_init = DR_INIT (data_ref);
2137 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2138 /* Count the number of data-refs in the chain. */
2139 count++;
2142 /* COUNT is the number of accesses found, we multiply it by the size of
2143 the type to get COUNT_IN_BYTES. */
2144 count_in_bytes = type_size * count;
2146 /* Check that the size of the interleaving (including gaps) is not
2147 greater than STEP. */
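/* For example, three 4-byte loads at element offsets 0, 1 and 3 within
   the group give count_in_bytes = 12 and gaps = 1, so the group covers
   12 + 1 * 4 = 16 bytes and therefore needs a step of at least 16.  */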
2148 if (dr_step != 0
2149 && absu_hwi (dr_step) < count_in_bytes + gaps * type_size)
2151 if (dump_enabled_p ())
2153 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2154 "interleaving size is greater than step for ");
2155 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dr));
2157 return false;
2160 /* Check that the size of the interleaving is equal to STEP for stores,
2161 i.e., that there are no gaps. */
2162 if (dr_step != 0
2163 && absu_hwi (dr_step) != count_in_bytes)
2165 if (DR_IS_READ (dr))
2167 slp_impossible = true;
2168 /* There is a gap after the last load in the group. This gap is a
2169 difference between the groupsize and the number of elements.
2170 When there is no gap, this difference should be 0. */
2171 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
2173 else
2175 if (dump_enabled_p ())
2176 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2177 "interleaved store with gaps");
2178 return false;
2182 /* Check that STEP is a multiple of type size. */
2183 if (dr_step != 0
2184 && (dr_step % type_size) != 0)
2186 if (dump_enabled_p ())
2188 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2189 "step is not a multiple of type size: step ");
2190 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2191 dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2192 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2193 TYPE_SIZE_UNIT (scalar_type));
2195 return false;
2198 if (groupsize == 0)
2199 groupsize = count;
2201 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2202 if (dump_enabled_p ())
2203 dump_printf_loc (MSG_NOTE, vect_location,
2204 "Detected interleaving of size %d", (int)groupsize);
2206 /* SLP: create an SLP data structure for every interleaving group of
2207 stores for further analysis in vect_analyze_slp. */
2208 if (DR_IS_WRITE (dr) && !slp_impossible)
2210 if (loop_vinfo)
2211 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2212 if (bb_vinfo)
2213 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2216 /* There is a gap in the end of the group. */
2217 if (groupsize - last_accessed_element > 0 && loop_vinfo)
2219 if (dump_enabled_p ())
2220 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2221 "Data access with gaps requires scalar "
2222 "epilogue loop");
2223 if (loop->inner)
2225 if (dump_enabled_p ())
2226 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2227 "Peeling for outer loop is not supported");
2228 return false;
2231 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2235 return true;
2239 /* Analyze the access pattern of the data-reference DR.
2240 In case of non-consecutive accesses call vect_analyze_group_access() to
2241 analyze groups of accesses. */
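/* A consecutive access is one whose DR_STEP equals the scalar type size
   (or its negation), e.g. a[i] with 4-byte elements and step 4; other
   accesses are either invariant loads, strided loads, or candidates for
   an interleaving group.  */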
2243 static bool
2244 vect_analyze_data_ref_access (struct data_reference *dr)
2246 tree step = DR_STEP (dr);
2247 tree scalar_type = TREE_TYPE (DR_REF (dr));
2248 gimple stmt = DR_STMT (dr);
2249 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2250 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2251 struct loop *loop = NULL;
2253 if (loop_vinfo)
2254 loop = LOOP_VINFO_LOOP (loop_vinfo);
2256 if (loop_vinfo && !step)
2258 if (dump_enabled_p ())
2259 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2260 "bad data-ref access in loop");
2261 return false;
2264 /* Allow invariant loads in loops. */
2265 if (loop_vinfo && integer_zerop (step))
2267 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2268 return DR_IS_READ (dr);
2271 if (loop && nested_in_vect_loop_p (loop, stmt))
2273 /* Interleaved accesses are not yet supported within outer-loop
2274 vectorization for references in the inner-loop. */
2275 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2277 /* For the rest of the analysis we use the outer-loop step. */
2278 step = STMT_VINFO_DR_STEP (stmt_info);
2279 if (integer_zerop (step))
2281 if (dump_enabled_p ())
2282 dump_printf_loc (MSG_NOTE, vect_location,
2283 "zero step in outer loop.");
2284 if (DR_IS_READ (dr))
2285 return true;
2286 else
2287 return false;
2291 /* Consecutive? */
2292 if (TREE_CODE (step) == INTEGER_CST)
2294 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2295 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2296 || (dr_step < 0
2297 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2299 /* Mark that it is not interleaving. */
2300 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2301 return true;
2305 if (loop && nested_in_vect_loop_p (loop, stmt))
2307 if (dump_enabled_p ())
2308 dump_printf_loc (MSG_NOTE, vect_location,
2309 "grouped access in outer loop.");
2310 return false;
2313 /* Assume this is a DR handled by non-constant strided load case. */
2314 if (TREE_CODE (step) != INTEGER_CST)
2315 return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2317 /* Not a consecutive access - check if it is part of an interleaving group. */
2318 return vect_analyze_group_access (dr);
2321 /* Comparison function for sorting data-references DRA and DRB so that
2322 data-references which can form an interleaving group end up adjacent. */
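/* The comparison key is, in order: base address, DR_OFFSET, read vs.
   write, access size, DR_STEP, DR_INIT and finally the statement UID,
   so that candidates for one interleaving chain sort next to each other
   with increasing DR_INIT.  */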
2324 static int
2325 dr_group_sort_cmp (const void *dra_, const void *drb_)
2327 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2328 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2329 hashval_t h1, h2;
2330 int cmp;
2332 /* Stabilize sort. */
2333 if (dra == drb)
2334 return 0;
2336 /* Ordering of DRs according to base. */
2337 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2339 h1 = iterative_hash_expr (DR_BASE_ADDRESS (dra), 0);
2340 h2 = iterative_hash_expr (DR_BASE_ADDRESS (drb), 0);
2341 if (h1 != h2)
2342 return h1 < h2 ? -1 : 1;
2345 /* And according to DR_OFFSET. */
2346 if (!dr_equal_offsets_p (dra, drb))
2348 h1 = iterative_hash_expr (DR_OFFSET (dra), 0);
2349 h2 = iterative_hash_expr (DR_OFFSET (drb), 0);
2350 if (h1 != h2)
2351 return h1 < h2 ? -1 : 1;
2354 /* Put reads before writes. */
2355 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2356 return DR_IS_READ (dra) ? -1 : 1;
2358 /* Then sort after access size. */
2359 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2360 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2362 h1 = iterative_hash_expr (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))), 0);
2363 h2 = iterative_hash_expr (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0);
2364 if (h1 != h2)
2365 return h1 < h2 ? -1 : 1;
2368 /* And after step. */
2369 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2371 h1 = iterative_hash_expr (DR_STEP (dra), 0);
2372 h2 = iterative_hash_expr (DR_STEP (drb), 0);
2373 if (h1 != h2)
2374 return h1 < h2 ? -1 : 1;
2377 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2378 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2379 if (cmp == 0)
2380 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2381 return cmp;
2384 /* Function vect_analyze_data_ref_accesses.
2386 Analyze the access pattern of all the data references in the loop.
2388 FORNOW: the only access pattern that is considered vectorizable is a
2389 simple step 1 (consecutive) access.
2391 FORNOW: handle only arrays and pointer accesses. */
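/* For example, the loads a[2*i] and a[2*i+1] share base, offset, size
   and step and differ only in DR_INIT, so the loop below links them
   into a single interleaving chain headed by the a[2*i] access.  */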
2393 bool
2394 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2396 unsigned int i;
2397 vec<data_reference_p> datarefs;
2398 struct data_reference *dr;
2400 if (dump_enabled_p ())
2401 dump_printf_loc (MSG_NOTE, vect_location,
2402 "=== vect_analyze_data_ref_accesses ===");
2404 if (loop_vinfo)
2405 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2406 else
2407 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2409 if (datarefs.is_empty ())
2410 return true;
2412 /* Sort the array of datarefs to make building the interleaving chains
2413 linear. */
2414 qsort (datarefs.address(), datarefs.length (),
2415 sizeof (data_reference_p), dr_group_sort_cmp);
2417 /* Build the interleaving chains. */
2418 for (i = 0; i < datarefs.length () - 1;)
2420 data_reference_p dra = datarefs[i];
2421 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2422 stmt_vec_info lastinfo = NULL;
2423 for (i = i + 1; i < datarefs.length (); ++i)
2425 data_reference_p drb = datarefs[i];
2426 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2428 /* ??? Imperfect sorting (non-compatible types, non-modulo
2429 accesses, same accesses) can lead to a group to be artificially
2430 split here as we don't just skip over those. If it really
2431 matters we can push those to a worklist and re-iterate
2432 over them. Then we can just skip ahead to the next DR here. */
2434 /* Check that the data-refs have the same first location (except init)
2435 and that they are both either stores or loads (not one load and one store). */
2436 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2437 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2438 DR_BASE_ADDRESS (drb), 0)
2439 || !dr_equal_offsets_p (dra, drb))
2440 break;
2442 /* Check that the data-refs have the same constant size and step. */
2443 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2444 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2445 if (!host_integerp (sza, 1)
2446 || !host_integerp (szb, 1)
2447 || !tree_int_cst_equal (sza, szb)
2448 || !host_integerp (DR_STEP (dra), 0)
2449 || !host_integerp (DR_STEP (drb), 0)
2450 || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2451 break;
2453 /* Do not place the same access in the interleaving chain twice. */
2454 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2455 break;
2457 /* Check the types are compatible.
2458 ??? We don't distinguish this during sorting. */
2459 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2460 TREE_TYPE (DR_REF (drb))))
2461 break;
2463 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2464 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2465 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2466 gcc_assert (init_a < init_b);
2468 /* If init_b == init_a + the size of the type * k, we have an
2469 interleaving, and DRA is accessed before DRB. */
2470 HOST_WIDE_INT type_size_a = TREE_INT_CST_LOW (sza);
2471 if ((init_b - init_a) % type_size_a != 0)
2472 break;
2474 /* The step (if not zero) is greater than the difference between
2475 data-refs' inits. This splits groups into suitable sizes. */
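/* E.g. with a step of 16 bytes, inits 0 and 8 can belong to the same
   group, whereas an init of 16 already belongs to the next iteration's
   access and therefore starts a new chain.  */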
2476 HOST_WIDE_INT step = TREE_INT_CST_LOW (DR_STEP (dra));
2477 if (step != 0 && step <= (init_b - init_a))
2478 break;
2480 if (dump_enabled_p ())
2482 dump_printf_loc (MSG_NOTE, vect_location,
2483 "Detected interleaving ");
2484 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2485 dump_printf (MSG_NOTE, " and ");
2486 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2489 /* Link the found element into the group list. */
2490 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2492 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2493 lastinfo = stmtinfo_a;
2495 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2496 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2497 lastinfo = stmtinfo_b;
2501 FOR_EACH_VEC_ELT (datarefs, i, dr)
2502 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2503 && !vect_analyze_data_ref_access (dr))
2505 if (dump_enabled_p ())
2506 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2507 "not vectorized: complicated access pattern.");
2509 if (bb_vinfo)
2511 /* Mark the statement as not vectorizable. */
2512 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2513 continue;
2515 else
2516 return false;
2519 return true;
2522 /* Function vect_prune_runtime_alias_test_list.
2524 Prune a list of ddrs to be tested at run-time by versioning for alias.
2525 Return FALSE if resulting list of ddrs is longer then allowed by
2526 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2528 bool
2529 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2531 vec<ddr_p> ddrs =
2532 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2533 unsigned i, j;
2535 if (dump_enabled_p ())
2536 dump_printf_loc (MSG_NOTE, vect_location,
2537 "=== vect_prune_runtime_alias_test_list ===");
2539 for (i = 0; i < ddrs.length (); )
2541 bool found;
2542 ddr_p ddr_i;
2544 ddr_i = ddrs[i];
2545 found = false;
2547 for (j = 0; j < i; j++)
2549 ddr_p ddr_j = ddrs[j];
2551 if (vect_vfa_range_equal (ddr_i, ddr_j))
2553 if (dump_enabled_p ())
2555 dump_printf_loc (MSG_NOTE, vect_location,
2556 "found equal ranges ");
2557 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr_i)));
2558 dump_printf (MSG_NOTE, ", ");
2559 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr_i)));
2560 dump_printf (MSG_NOTE, " and ");
2561 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr_j)));
2562 dump_printf (MSG_NOTE, ", ");
2563 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr_j)));
2565 found = true;
2566 break;
2570 if (found)
2572 ddrs.ordered_remove (i);
2573 continue;
2575 i++;
2578 if (ddrs.length () >
2579 (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2581 if (dump_enabled_p ())
2583 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2584 "disable versioning for alias - max number of "
2585 "generated checks exceeded.");
2588 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0);
2590 return false;
2593 return true;
2596 /* Check whether a non-affine read in stmt is suitable for gather load
2597 and if so, return a builtin decl for that operation. */
2599 tree
2600 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2601 tree *offp, int *scalep)
2603 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2604 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2605 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2606 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2607 tree offtype = NULL_TREE;
2608 tree decl, base, off;
2609 enum machine_mode pmode;
2610 int punsignedp, pvolatilep;
2612 /* The gather builtins need address of the form
2613 loop_invariant + vector * {1, 2, 4, 8}
2614 or
2615 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2616 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
2617 of loop invariants/SSA_NAMEs defined in the loop, with casts,
2618 multiplications and additions in it. To get a vector, we need
2619 a single SSA_NAME that will be defined in the loop and will
2620 contain everything that is not loop invariant and that can be
2621 vectorized. The following code attempts to find such a preexisting
2622 SSA_NAME OFF and put the loop invariants into a tree BASE
2623 that can be gimplified before the loop. */
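/* For instance, for a load like p[idx[i]] with 8-byte elements the aim
   is BASE = p (gimplifiable before the loop), OFF = the SSA_NAME
   holding idx[i], and SCALE = 8.  */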
2624 base = get_inner_reference (DR_REF (dr), &pbitsize, &pbitpos, &off,
2625 &pmode, &punsignedp, &pvolatilep, false);
2626 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
2628 if (TREE_CODE (base) == MEM_REF)
2630 if (!integer_zerop (TREE_OPERAND (base, 1)))
2632 if (off == NULL_TREE)
2634 double_int moff = mem_ref_offset (base);
2635 off = double_int_to_tree (sizetype, moff);
2637 else
2638 off = size_binop (PLUS_EXPR, off,
2639 fold_convert (sizetype, TREE_OPERAND (base, 1)));
2641 base = TREE_OPERAND (base, 0);
2643 else
2644 base = build_fold_addr_expr (base);
2646 if (off == NULL_TREE)
2647 off = size_zero_node;
2649 /* If base is not loop invariant, then if off is 0 we start with just
2650 the constant offset in the loop invariant BASE and continue with base
2651 as OFF, otherwise give up.
2652 We could handle that case by gimplifying the addition of base + off
2653 into some SSA_NAME and use that as off, but for now punt. */
2654 if (!expr_invariant_in_loop_p (loop, base))
2656 if (!integer_zerop (off))
2657 return NULL_TREE;
2658 off = base;
2659 base = size_int (pbitpos / BITS_PER_UNIT);
2661 /* Otherwise put base + constant offset into the loop invariant BASE
2662 and continue with OFF. */
2663 else
2665 base = fold_convert (sizetype, base);
2666 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
2669 /* OFF at this point may be either a SSA_NAME or some tree expression
2670 from get_inner_reference. Try to peel off loop invariants from it
2671 into BASE as long as possible. */
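/* For instance, if OFF were (n + k) * 4 with n loop invariant and k
   defined inside the loop, the MULT_EXPR case would record SCALE = 4,
   the PLUS_EXPR case would move n * 4 into BASE, and the walk would end
   with OFF being the SSA_NAME of k.  */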
2672 STRIP_NOPS (off);
2673 while (offtype == NULL_TREE)
2675 enum tree_code code;
2676 tree op0, op1, add = NULL_TREE;
2678 if (TREE_CODE (off) == SSA_NAME)
2680 gimple def_stmt = SSA_NAME_DEF_STMT (off);
2682 if (expr_invariant_in_loop_p (loop, off))
2683 return NULL_TREE;
2685 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
2686 break;
2688 op0 = gimple_assign_rhs1 (def_stmt);
2689 code = gimple_assign_rhs_code (def_stmt);
2690 op1 = gimple_assign_rhs2 (def_stmt);
2692 else
2694 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
2695 return NULL_TREE;
2696 code = TREE_CODE (off);
2697 extract_ops_from_tree (off, &code, &op0, &op1);
2699 switch (code)
2701 case POINTER_PLUS_EXPR:
2702 case PLUS_EXPR:
2703 if (expr_invariant_in_loop_p (loop, op0))
2705 add = op0;
2706 off = op1;
2707 do_add:
2708 add = fold_convert (sizetype, add);
2709 if (scale != 1)
2710 add = size_binop (MULT_EXPR, add, size_int (scale));
2711 base = size_binop (PLUS_EXPR, base, add);
2712 continue;
2714 if (expr_invariant_in_loop_p (loop, op1))
2716 add = op1;
2717 off = op0;
2718 goto do_add;
2720 break;
2721 case MINUS_EXPR:
2722 if (expr_invariant_in_loop_p (loop, op1))
2724 add = fold_convert (sizetype, op1);
2725 add = size_binop (MINUS_EXPR, size_zero_node, add);
2726 off = op0;
2727 goto do_add;
2729 break;
2730 case MULT_EXPR:
2731 if (scale == 1 && host_integerp (op1, 0))
2733 scale = tree_low_cst (op1, 0);
2734 off = op0;
2735 continue;
2737 break;
2738 case SSA_NAME:
2739 off = op0;
2740 continue;
2741 CASE_CONVERT:
2742 if (!POINTER_TYPE_P (TREE_TYPE (op0))
2743 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
2744 break;
2745 if (TYPE_PRECISION (TREE_TYPE (op0))
2746 == TYPE_PRECISION (TREE_TYPE (off)))
2748 off = op0;
2749 continue;
2751 if (TYPE_PRECISION (TREE_TYPE (op0))
2752 < TYPE_PRECISION (TREE_TYPE (off)))
2754 off = op0;
2755 offtype = TREE_TYPE (off);
2756 STRIP_NOPS (off);
2757 continue;
2759 break;
2760 default:
2761 break;
2763 break;
2766 /* If at the end OFF still isn't a SSA_NAME or isn't
2767 defined in the loop, punt. */
2768 if (TREE_CODE (off) != SSA_NAME
2769 || expr_invariant_in_loop_p (loop, off))
2770 return NULL_TREE;
2772 if (offtype == NULL_TREE)
2773 offtype = TREE_TYPE (off);
2775 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
2776 offtype, scale);
2777 if (decl == NULL_TREE)
2778 return NULL_TREE;
2780 if (basep)
2781 *basep = base;
2782 if (offp)
2783 *offp = off;
2784 if (scalep)
2785 *scalep = scale;
2786 return decl;
2789 /* Function vect_analyze_data_refs.
2791 Find all the data references in the loop or basic block.
2793 The general structure of the analysis of data refs in the vectorizer is as
2794 follows:
2795 1- vect_analyze_data_refs(loop/bb): call
2796 compute_data_dependences_for_loop/bb to find and analyze all data-refs
2797 in the loop/bb and their dependences.
2798 2- vect_analyze_dependences(): apply dependence testing using ddrs.
2799 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
2800 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
2804 bool
2805 vect_analyze_data_refs (loop_vec_info loop_vinfo,
2806 bb_vec_info bb_vinfo,
2807 int *min_vf)
2809 struct loop *loop = NULL;
2810 basic_block bb = NULL;
2811 unsigned int i;
2812 vec<data_reference_p> datarefs;
2813 struct data_reference *dr;
2814 tree scalar_type;
2816 if (dump_enabled_p ())
2817 dump_printf_loc (MSG_NOTE, vect_location,
2818 "=== vect_analyze_data_refs ===\n");
2820 if (loop_vinfo)
2822 loop = LOOP_VINFO_LOOP (loop_vinfo);
2823 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo))
2824 || find_data_references_in_loop
2825 (loop, &LOOP_VINFO_DATAREFS (loop_vinfo)))
2827 if (dump_enabled_p ())
2828 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2829 "not vectorized: loop contains function calls"
2830 " or data references that cannot be analyzed");
2831 return false;
2834 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2836 else
2838 gimple_stmt_iterator gsi;
2840 bb = BB_VINFO_BB (bb_vinfo);
2841 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2843 gimple stmt = gsi_stmt (gsi);
2844 if (!find_data_references_in_stmt (NULL, stmt,
2845 &BB_VINFO_DATAREFS (bb_vinfo)))
2847 /* Mark the rest of the basic-block as unvectorizable. */
2848 for (; !gsi_end_p (gsi); gsi_next (&gsi))
2850 stmt = gsi_stmt (gsi);
2851 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
2853 break;
2857 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2860 /* Go through the data-refs, check that the analysis succeeded. Update
2861 pointer from stmt_vec_info struct to DR and vectype. */
2863 FOR_EACH_VEC_ELT (datarefs, i, dr)
2865 gimple stmt;
2866 stmt_vec_info stmt_info;
2867 tree base, offset, init;
2868 bool gather = false;
2869 int vf;
2871 if (!dr || !DR_REF (dr))
2873 if (dump_enabled_p ())
2874 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2875 "not vectorized: unhandled data-ref ");
2876 return false;
2879 stmt = DR_STMT (dr);
2880 stmt_info = vinfo_for_stmt (stmt);
2882 /* Check that analysis of the data-ref succeeded. */
2883 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
2884 || !DR_STEP (dr))
2886 /* If the target supports vector gather loads, see whether one can
2887 be used for this reference. */
2888 if (loop_vinfo
2889 && DR_IS_READ (dr)
2890 && !TREE_THIS_VOLATILE (DR_REF (dr))
2891 && targetm.vectorize.builtin_gather != NULL
2892 && !nested_in_vect_loop_p (loop, stmt))
2894 struct data_reference *newdr
2895 = create_data_ref (NULL, loop_containing_stmt (stmt),
2896 DR_REF (dr), stmt, true);
2897 gcc_assert (newdr != NULL && DR_REF (newdr));
2898 if (DR_BASE_ADDRESS (newdr)
2899 && DR_OFFSET (newdr)
2900 && DR_INIT (newdr)
2901 && DR_STEP (newdr)
2902 && integer_zerop (DR_STEP (newdr)))
2904 dr = newdr;
2905 gather = true;
2907 else
2908 free_data_ref (newdr);
2911 if (!gather)
2913 if (dump_enabled_p ())
2915 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2916 "not vectorized: data ref analysis "
2917 "failed ");
2918 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2921 if (bb_vinfo)
2922 break;
2924 return false;
2928 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
2930 if (dump_enabled_p ())
2931 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2932 "not vectorized: base addr of dr is a "
2933 "constant");
2935 if (bb_vinfo)
2936 break;
2938 if (gather)
2939 free_data_ref (dr);
2940 return false;
2943 if (TREE_THIS_VOLATILE (DR_REF (dr)))
2945 if (dump_enabled_p ())
2947 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2948 "not vectorized: volatile type ");
2949 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2952 if (bb_vinfo)
2953 break;
2955 return false;
2958 if (stmt_can_throw_internal (stmt))
2960 if (dump_enabled_p ())
2962 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2963 "not vectorized: statement can throw an "
2964 "exception ");
2965 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2968 if (bb_vinfo)
2969 break;
2971 if (gather)
2972 free_data_ref (dr);
2973 return false;
2976 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
2977 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
2979 if (dump_enabled_p ())
2981 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2982 "not vectorized: statement is bitfield "
2983 "access ");
2984 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2987 if (bb_vinfo)
2988 break;
2990 if (gather)
2991 free_data_ref (dr);
2992 return false;
2995 base = unshare_expr (DR_BASE_ADDRESS (dr));
2996 offset = unshare_expr (DR_OFFSET (dr));
2997 init = unshare_expr (DR_INIT (dr));
2999 if (is_gimple_call (stmt))
3001 if (dump_enabled_p ())
3003 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3004 "not vectorized: dr in a call ");
3005 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3008 if (bb_vinfo)
3009 break;
3011 if (gather)
3012 free_data_ref (dr);
3013 return false;
3016 /* Update DR field in stmt_vec_info struct. */
3018 /* If the dataref is in an inner-loop of the loop that is considered
3019 for vectorization, we also want to analyze the access relative to
3020 the outer-loop (DR contains information only relative to the
3021 inner-most enclosing loop). We do that by building a reference to the
3022 first location accessed by the inner-loop, and analyze it relative to
3023 the outer-loop. */
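/* For example, when outer-loop vectorizing
     for (i) for (j) ... = a[i][j];
   the DR describes a[i][j] relative to the j-loop only; the code below
   re-analyzes the address of a[i][0] as an induction relative to the
   i-loop, giving an outer step equal to the size of one row of a.  */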
3024 if (loop && nested_in_vect_loop_p (loop, stmt))
3026 tree outer_step, outer_base, outer_init;
3027 HOST_WIDE_INT pbitsize, pbitpos;
3028 tree poffset;
3029 enum machine_mode pmode;
3030 int punsignedp, pvolatilep;
3031 affine_iv base_iv, offset_iv;
3032 tree dinit;
3034 /* Build a reference to the first location accessed by the
3035 inner-loop: *(BASE+INIT). (The first location is actually
3036 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3037 tree inner_base = build_fold_indirect_ref
3038 (fold_build_pointer_plus (base, init));
3040 if (dump_enabled_p ())
3042 dump_printf_loc (MSG_NOTE, vect_location,
3043 "analyze in outer-loop: ");
3044 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3047 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3048 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3049 gcc_assert (outer_base != NULL_TREE);
3051 if (pbitpos % BITS_PER_UNIT != 0)
3053 if (dump_enabled_p ())
3054 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3055 "failed: bit offset alignment.\n");
3056 return false;
3059 outer_base = build_fold_addr_expr (outer_base);
3060 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3061 &base_iv, false))
3063 if (dump_enabled_p ())
3064 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3065 "failed: evolution of base is not affine.\n");
3066 return false;
3069 if (offset)
3071 if (poffset)
3072 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3073 poffset);
3074 else
3075 poffset = offset;
3078 if (!poffset)
3080 offset_iv.base = ssize_int (0);
3081 offset_iv.step = ssize_int (0);
3083 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3084 &offset_iv, false))
3086 if (dump_enabled_p ())
3087 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3088 "evolution of offset is not affine.\n");
3089 return false;
3092 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3093 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3094 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3095 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3096 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3098 outer_step = size_binop (PLUS_EXPR,
3099 fold_convert (ssizetype, base_iv.step),
3100 fold_convert (ssizetype, offset_iv.step));
3102 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3103 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3104 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3105 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3106 STMT_VINFO_DR_OFFSET (stmt_info) =
3107 fold_convert (ssizetype, offset_iv.base);
3108 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3109 size_int (highest_pow2_factor (offset_iv.base));
3111 if (dump_enabled_p ())
3113 dump_printf_loc (MSG_NOTE, vect_location,
3114 "\touter base_address: ");
3115 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3116 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3117 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3118 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3119 STMT_VINFO_DR_OFFSET (stmt_info));
3120 dump_printf (MSG_NOTE,
3121 "\n\touter constant offset from base address: ");
3122 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3123 STMT_VINFO_DR_INIT (stmt_info));
3124 dump_printf (MSG_NOTE, "\n\touter step: ");
3125 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3126 STMT_VINFO_DR_STEP (stmt_info));
3127 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3128 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3129 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3133 if (STMT_VINFO_DATA_REF (stmt_info))
3135 if (dump_enabled_p ())
3137 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3138 "not vectorized: more than one data ref "
3139 "in stmt: ");
3140 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3143 if (bb_vinfo)
3144 break;
3146 if (gather)
3147 free_data_ref (dr);
3148 return false;
3151 STMT_VINFO_DATA_REF (stmt_info) = dr;
3153 /* Set vectype for STMT. */
3154 scalar_type = TREE_TYPE (DR_REF (dr));
3155 STMT_VINFO_VECTYPE (stmt_info) =
3156 get_vectype_for_scalar_type (scalar_type);
3157 if (!STMT_VINFO_VECTYPE (stmt_info))
3159 if (dump_enabled_p ())
3161 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3162 "not vectorized: no vectype for stmt: ");
3163 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3164 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3165 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3166 scalar_type);
3169 if (bb_vinfo)
3170 break;
3172 if (gather)
3174 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3175 free_data_ref (dr);
3177 return false;
3179 else
3181 if (dump_enabled_p ())
3183 dump_printf_loc (MSG_NOTE, vect_location,
3184 "got vectype for stmt: ");
3185 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3186 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3187 STMT_VINFO_VECTYPE (stmt_info));
3191 /* Adjust the minimal vectorization factor according to the
3192 vector type. */
3193 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3194 if (vf > *min_vf)
3195 *min_vf = vf;
3197 if (gather)
3199 tree off;
3201 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3202 if (gather
3203 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3204 gather = false;
3205 if (!gather)
3207 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3208 free_data_ref (dr);
3209 if (dump_enabled_p ())
3211 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3212 "not vectorized: not suitable for gather "
3213 "load ");
3214 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3216 return false;
3219 datarefs[i] = dr;
3220 STMT_VINFO_GATHER_P (stmt_info) = true;
3222 else if (loop_vinfo
3223 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3225 if (nested_in_vect_loop_p (loop, stmt)
3226 || !DR_IS_READ (dr))
3228 if (dump_enabled_p ())
3230 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3231 "not vectorized: not suitable for strided "
3232 "load ");
3233 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3235 return false;
3237 STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3241 /* If, when trying to vectorize a basic-block, we stopped the analysis
3242 at the first dataref we could not analyze, mark the rest of the
3243 datarefs as not vectorizable and truncate the vector of datarefs.
3244 That avoids spending useless time analyzing their dependences. */
3245 if (i != datarefs.length ())
3247 gcc_assert (bb_vinfo != NULL);
3248 for (unsigned j = i; j < datarefs.length (); ++j)
3250 data_reference_p dr = datarefs[j];
3251 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3252 free_data_ref (dr);
3254 datarefs.truncate (i);
3257 return true;
3261 /* Function vect_get_new_vect_var.
3263 Returns a name for a new variable. The current naming scheme uses the
3264 prefix "vect", "stmp" or "vectp" (depending on the value of VAR_KIND)
3265 for vectorizer generated variables, and appends "_" and NAME to it if
3266 NAME is provided. */
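/* For example, a vect_pointer_var created for NAME "a" is named
   "vectp_a".  */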
3268 tree
3269 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3271 const char *prefix;
3272 tree new_vect_var;
3274 switch (var_kind)
3276 case vect_simple_var:
3277 prefix = "vect";
3278 break;
3279 case vect_scalar_var:
3280 prefix = "stmp";
3281 break;
3282 case vect_pointer_var:
3283 prefix = "vectp";
3284 break;
3285 default:
3286 gcc_unreachable ();
3289 if (name)
3291 char* tmp = concat (prefix, "_", name, NULL);
3292 new_vect_var = create_tmp_reg (type, tmp);
3293 free (tmp);
3295 else
3296 new_vect_var = create_tmp_reg (type, prefix);
3298 return new_vect_var;
3302 /* Function vect_create_addr_base_for_vector_ref.
3304 Create an expression that computes the address of the first memory location
3305 that will be accessed for a data reference.
3307 Input:
3308 STMT: The statement containing the data reference.
3309 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3310 OFFSET: Optional. If supplied, it is added to the initial address.
3311 LOOP: Specify relative to which loop-nest should the address be computed.
3312 For example, when the dataref is in an inner-loop nested in an
3313 outer-loop that is now being vectorized, LOOP can be either the
3314 outer-loop, or the inner-loop. The first memory location accessed
3315 by the following dataref ('in' points to short):
3317 for (i=0; i<N; i++)
3318 for (j=0; j<M; j++)
3319 s += in[i+j]
3321 is as follows:
3322 if LOOP=i_loop: &in (relative to i_loop)
3323 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3325 Output:
3326 1. Return an SSA_NAME whose value is the address of the memory location of
3327 the first vector of the data reference.
3328 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3329 these statement(s) which define the returned SSA_NAME.
3331 FORNOW: We are only handling array accesses with step 1. */
3333 tree
3334 vect_create_addr_base_for_vector_ref (gimple stmt,
3335 gimple_seq *new_stmt_list,
3336 tree offset,
3337 struct loop *loop)
3339 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3340 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3341 tree data_ref_base;
3342 const char *base_name;
3343 tree addr_base;
3344 tree dest;
3345 gimple_seq seq = NULL;
3346 tree base_offset;
3347 tree init;
3348 tree vect_ptr_type;
3349 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3350 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3352 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3354 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3356 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3358 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3359 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3360 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3362 else
3364 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3365 base_offset = unshare_expr (DR_OFFSET (dr));
3366 init = unshare_expr (DR_INIT (dr));
3369 if (loop_vinfo)
3370 base_name = get_name (data_ref_base);
3371 else
3373 base_offset = ssize_int (0);
3374 init = ssize_int (0);
3375 base_name = get_name (DR_REF (dr));
3378 /* Create base_offset */
3379 base_offset = size_binop (PLUS_EXPR,
3380 fold_convert (sizetype, base_offset),
3381 fold_convert (sizetype, init));
3383 if (offset)
3385 offset = fold_build2 (MULT_EXPR, sizetype,
3386 fold_convert (sizetype, offset), step);
3387 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3388 base_offset, offset);
3391 /* base + base_offset */
3392 if (loop_vinfo)
3393 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3394 else
3396 addr_base = build1 (ADDR_EXPR,
3397 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3398 unshare_expr (DR_REF (dr)));
3401 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3402 addr_base = fold_convert (vect_ptr_type, addr_base);
3403 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3404 addr_base = force_gimple_operand (addr_base, &seq, false, dest);
3405 gimple_seq_add_seq (new_stmt_list, seq);
3407 if (DR_PTR_INFO (dr)
3408 && TREE_CODE (addr_base) == SSA_NAME)
3410 duplicate_ssa_name_ptr_info (addr_base, DR_PTR_INFO (dr));
3411 if (offset)
3412 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3415 if (dump_enabled_p ())
3417 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3418 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3421 return addr_base;
3425 /* Function vect_create_data_ref_ptr.
3427 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3428 location accessed in the loop by STMT, along with the def-use update
3429 chain to appropriately advance the pointer through the loop iterations.
3430 Also set aliasing information for the pointer. This pointer is used by
3431 the callers to this function to create a memory reference expression for
3432 vector load/store access.
3434 Input:
3435 1. STMT: a stmt that references memory. Expected to be of the form
3436 GIMPLE_ASSIGN <name, data-ref> or
3437 GIMPLE_ASSIGN <data-ref, name>.
3438 2. AGGR_TYPE: the type of the reference, which should be either a vector
3439 or an array.
3440 3. AT_LOOP: the loop where the vector memref is to be created.
3441 4. OFFSET (optional): an offset to be added to the initial address accessed
3442 by the data-ref in STMT.
3443 5. BSI: location where the new stmts are to be placed if there is no loop
3444 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
3445 pointing to the initial address.
3447 Output:
3448 1. Declare a new ptr to vector_type, and have it point to the base of the
3449 data reference (initial address accessed by the data reference).
3450 For example, for vector of type V8HI, the following code is generated:
3452 v8hi *ap;
3453 ap = (v8hi *)initial_address;
3455 if OFFSET is not supplied:
3456 initial_address = &a[init];
3457 if OFFSET is supplied:
3458 initial_address = &a[init + OFFSET];
3460 Return the initial_address in INITIAL_ADDRESS.
3462 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
3463 update the pointer in each iteration of the loop.
3465 Return the increment stmt that updates the pointer in PTR_INCR.
3467 3. Set INV_P to true if the access pattern of the data reference in the
3468 vectorized loop is invariant. Set it to false otherwise.
3470 4. Return the pointer. */
3472 tree
3473 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
3474 tree offset, tree *initial_address,
3475 gimple_stmt_iterator *gsi, gimple *ptr_incr,
3476 bool only_init, bool *inv_p)
3478 const char *base_name;
3479 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3480 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3481 struct loop *loop = NULL;
3482 bool nested_in_vect_loop = false;
3483 struct loop *containing_loop = NULL;
3484 tree aggr_ptr_type;
3485 tree aggr_ptr;
3486 tree new_temp;
3487 gimple vec_stmt;
3488 gimple_seq new_stmt_list = NULL;
3489 edge pe = NULL;
3490 basic_block new_bb;
3491 tree aggr_ptr_init;
3492 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3493 tree aptr;
3494 gimple_stmt_iterator incr_gsi;
3495 bool insert_after;
3496 tree indx_before_incr, indx_after_incr;
3497 gimple incr;
3498 tree step;
3499 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
3501 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
3502 || TREE_CODE (aggr_type) == VECTOR_TYPE);
3504 if (loop_vinfo)
3506 loop = LOOP_VINFO_LOOP (loop_vinfo);
3507 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
3508 containing_loop = (gimple_bb (stmt))->loop_father;
3509 pe = loop_preheader_edge (loop);
3511 else
3513 gcc_assert (bb_vinfo);
3514 only_init = true;
3515 *ptr_incr = NULL;
3518 /* Check the step (evolution) of the load in LOOP, and record
3519 whether it's invariant. */
3520 if (nested_in_vect_loop)
3521 step = STMT_VINFO_DR_STEP (stmt_info);
3522 else
3523 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
3525 if (integer_zerop (step))
3526 *inv_p = true;
3527 else
3528 *inv_p = false;
3530 /* Create an expression for the first address accessed by this load
3531 in LOOP. */
3532 base_name = get_name (DR_BASE_ADDRESS (dr));
3534 if (dump_enabled_p ())
3536 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
3537 dump_printf_loc (MSG_NOTE, vect_location,
3538 "create %s-pointer variable to type: ",
3539 tree_code_name[(int) TREE_CODE (aggr_type)]);
3540 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
3541 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
3542 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
3543 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
3544 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
3545 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
3546 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
3547 else
3548 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
3549 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
3552 /* (1) Create the new aggregate-pointer variable.
3553 Vector and array types inherit the alias set of their component
3554 type by default so we need to use a ref-all pointer if the data
3555 reference does not conflict with the created aggregated data
3556 reference because it is not addressable. */
3557 bool need_ref_all = false;
3558 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
3559 get_alias_set (DR_REF (dr))))
3560 need_ref_all = true;
3561 /* Likewise for any of the data references in the stmt group. */
3562 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
3564 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
3567 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
3568 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
3569 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
3570 get_alias_set (DR_REF (sdr))))
3572 need_ref_all = true;
3573 break;
3575 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
3577 while (orig_stmt);
3579 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
3580 need_ref_all);
3581 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
3584 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
3585 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
3586 def-use update cycles for the pointer: one relative to the outer-loop
3587 (LOOP), which is what steps (3) and (4) below do. The other is relative
3588 to the inner-loop (which is the inner-most loop containing the dataref),
3589 and this is done by step (5) below.
3591 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
3592 inner-most loop, and so steps (3),(4) work the same, and step (5) is
3593 redundant. Steps (3),(4) create the following:
3595 vp0 = &base_addr;
3596 LOOP: vp1 = phi(vp0,vp2)
3599 vp2 = vp1 + step
3600 goto LOOP
3602 If there is an inner-loop nested in loop, then step (5) will also be
3603 applied, and an additional update in the inner-loop will be created:
3605 vp0 = &base_addr;
3606 LOOP: vp1 = phi(vp0,vp2)
3608 inner: vp3 = phi(vp1,vp4)
3609 vp4 = vp3 + inner_step
3610 if () goto inner
3612 vp2 = vp1 + step
3613 if () goto LOOP */
3615 /* (2) Calculate the initial address of the aggregate-pointer, and set
3616 the aggregate-pointer to point to it before the loop. */
3618 /* Create: (&(base[init_val+offset]) in the loop preheader. */
3620 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
3621 offset, loop);
3622 if (new_stmt_list)
3624 if (pe)
3626 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
3627 gcc_assert (!new_bb);
3629 else
3630 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
3633 *initial_address = new_temp;
3635 /* Create: p = (aggr_type *) initial_base */
3636 if (TREE_CODE (new_temp) != SSA_NAME
3637 || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
3639 vec_stmt = gimple_build_assign (aggr_ptr,
3640 fold_convert (aggr_ptr_type, new_temp));
3641 aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
3642 /* Copy the points-to information if it exists. */
3643 if (DR_PTR_INFO (dr))
3644 duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
3645 gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
3646 if (pe)
3648 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
3649 gcc_assert (!new_bb);
3651 else
3652 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
3654 else
3655 aggr_ptr_init = new_temp;
3657 /* (3) Handle the updating of the aggregate-pointer inside the loop.
3658 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
3659 inner-loop nested in LOOP (during outer-loop vectorization). */
3661 /* No update in loop is required. */
3662 if (only_init && (!loop_vinfo || at_loop == loop))
3663 aptr = aggr_ptr_init;
3664 else
3666 /* The step of the aggregate pointer is the type size. */
3667 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
3668 /* One exception to the above is when the scalar step of the load in
3669 LOOP is zero. In this case the step here is also zero. */
3670 if (*inv_p)
3671 iv_step = size_zero_node;
3672 else if (tree_int_cst_sgn (step) == -1)
3673 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
3675 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
3677 create_iv (aggr_ptr_init,
3678 fold_convert (aggr_ptr_type, iv_step),
3679 aggr_ptr, loop, &incr_gsi, insert_after,
3680 &indx_before_incr, &indx_after_incr);
3681 incr = gsi_stmt (incr_gsi);
3682 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
3684 /* Copy the points-to information if it exists. */
3685 if (DR_PTR_INFO (dr))
3687 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
3688 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
3690 if (ptr_incr)
3691 *ptr_incr = incr;
3693 aptr = indx_before_incr;
3696 if (!nested_in_vect_loop || only_init)
3697 return aptr;
3700 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
3701 nested in LOOP, if it exists. */
3703 gcc_assert (nested_in_vect_loop);
3704 if (!only_init)
3706 standard_iv_increment_position (containing_loop, &incr_gsi,
3707 &insert_after);
3708 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
3709 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
3710 &indx_after_incr);
3711 incr = gsi_stmt (incr_gsi);
3712 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
3714 /* Copy the points-to information if it exists. */
3715 if (DR_PTR_INFO (dr))
3717 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
3718 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
3720 if (ptr_incr)
3721 *ptr_incr = incr;
3723 return indx_before_incr;
3725 else
3726 gcc_unreachable ();
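/* Illustrative sketch only (SSA names hypothetical): for a non-nested loop
   and an AGGR_TYPE that is 16 bytes wide, steps (2)-(4) above correspond
   roughly to the following GIMPLE:

     (preheader)  vectp.0 = &base[init_val + offset];
     loop:
                  vectp.1 = PHI <vectp.0, vectp.2>
                  ...
                  vectp.2 = vectp.1 + 16;    iv_step == TYPE_SIZE_UNIT (aggr_type)
*/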
3730 /* Function bump_vector_ptr
3732 Increment a pointer (to a vector type) by vector-size. If requested,
3733 i.e. if PTR_INCR is given, then also connect the new increment stmt
3734 to the existing def-use update-chain of the pointer, by modifying
3735 the PTR_INCR as illustrated below:
3737 The pointer def-use update-chain before this function:
3738 DATAREF_PTR = phi (p_0, p_2)
3739 ....
3740 PTR_INCR: p_2 = DATAREF_PTR + step
3742 The pointer def-use update-chain after this function:
3743 DATAREF_PTR = phi (p_0, p_2)
3744 ....
3745 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
3746 ....
3747 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
3749 Input:
3750 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
3751 in the loop.
3752 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
3753 the loop. The increment amount across iterations is expected
3754 to be vector_size.
3755 BSI - location where the new update stmt is to be placed.
3756 STMT - the original scalar memory-access stmt that is being vectorized.
3757 BUMP - optional. The offset by which to bump the pointer. If not given,
3758 the offset is assumed to be vector_size.
3760 Output: Return NEW_DATAREF_PTR as illustrated above.
3764 tree
3765 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
3766 gimple stmt, tree bump)
3768 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3769 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3770 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3771 tree update = TYPE_SIZE_UNIT (vectype);
3772 gimple incr_stmt;
3773 ssa_op_iter iter;
3774 use_operand_p use_p;
3775 tree new_dataref_ptr;
3777 if (bump)
3778 update = bump;
3780 new_dataref_ptr = copy_ssa_name (dataref_ptr, NULL);
3781 incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, new_dataref_ptr,
3782 dataref_ptr, update);
3783 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
3785 /* Copy the points-to information if it exists. */
3786 if (DR_PTR_INFO (dr))
3788 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
3789 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
3792 if (!ptr_incr)
3793 return new_dataref_ptr;
3795 /* Update the vector-pointer's cross-iteration increment. */
3796 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
3798 tree use = USE_FROM_PTR (use_p);
3800 if (use == dataref_ptr)
3801 SET_USE (use_p, new_dataref_ptr);
3802 else
3803 gcc_assert (tree_int_cst_compare (use, update) == 0);
3806 return new_dataref_ptr;
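/* Illustrative example (hypothetical SSA names): for a V4SI data-ref with
   BUMP omitted, bump_vector_ptr emits

     vectp.1 = vectp.0 + 16;        16 == TYPE_SIZE_UNIT (V4SI)

   and, if PTR_INCR was "p_2 = vectp.0 + step", its use of vectp.0 is
   rewritten to vectp.1, preserving the cross-iteration update chain.  */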
3810 /* Function vect_create_destination_var.
3812 Create a new temporary of type VECTYPE. */
3814 tree
3815 vect_create_destination_var (tree scalar_dest, tree vectype)
3817 tree vec_dest;
3818 const char *name;
3819 char *new_name;
3820 tree type;
3821 enum vect_var_kind kind;
3823 kind = vectype ? vect_simple_var : vect_scalar_var;
3824 type = vectype ? vectype : TREE_TYPE (scalar_dest);
3826 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
3828 name = get_name (scalar_dest);
3829 if (name)
3830 asprintf (&new_name, "%s_%u", name, SSA_NAME_VERSION (scalar_dest));
3831 else
3832 asprintf (&new_name, "_%u", SSA_NAME_VERSION (scalar_dest));
3833 vec_dest = vect_get_new_vect_var (type, kind, new_name);
3834 free (new_name);
3836 return vec_dest;
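/* Illustrative example: for SCALAR_DEST "sum_17" (base name "sum", SSA
   version 17) and a non-NULL VECTYPE, NEW_NAME is "sum_17" and the
   variable returned by vect_get_new_vect_var carries that string plus
   whatever prefix it chooses for vect_simple_var (typically "vect_").  */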
3839 /* Function vect_grouped_store_supported.
3841 Returns TRUE if interleave high and interleave low permutations
3842 are supported, and FALSE otherwise. */
3844 bool
3845 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
3847 enum machine_mode mode = TYPE_MODE (vectype);
3849 /* vect_permute_store_chain requires the group size to be a power of two. */
3850 if (exact_log2 (count) == -1)
3852 if (dump_enabled_p ())
3853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3854 "the size of the group of accesses"
3855 " is not a power of 2");
3856 return false;
3859 /* Check that the permutation is supported. */
3860 if (VECTOR_MODE_P (mode))
3862 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3863 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3864 for (i = 0; i < nelt / 2; i++)
3866 sel[i * 2] = i;
3867 sel[i * 2 + 1] = i + nelt;
3869 if (can_vec_perm_p (mode, false, sel))
3871 for (i = 0; i < nelt; i++)
3872 sel[i] += nelt / 2;
3873 if (can_vec_perm_p (mode, false, sel))
3874 return true;
3878 if (dump_enabled_p ())
3879 dump_printf (MSG_MISSED_OPTIMIZATION,
3880 "interleave op not supported by target.");
3881 return false;
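/* Worked example of the selectors checked above, assuming a vector mode
   with nelt == 4: the interleave-high selector is {0, 4, 1, 5}, and after
   adding nelt/2 to every element the interleave-low selector is
   {2, 6, 3, 7}.  */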
3885 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
3886 type VECTYPE. */
3888 bool
3889 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
3891 return vect_lanes_optab_supported_p ("vec_store_lanes",
3892 vec_store_lanes_optab,
3893 vectype, count);
3897 /* Function vect_permute_store_chain.
3899 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
3900 a power of 2, generate interleave_high/low stmts to reorder the data
3901 correctly for the stores. Return the final references for stores in
3902 RESULT_CHAIN.
3904 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
3905 The input is 4 vectors each containing 8 elements. We assign a number to
3906 each element, the input sequence is:
3908 1st vec: 0 1 2 3 4 5 6 7
3909 2nd vec: 8 9 10 11 12 13 14 15
3910 3rd vec: 16 17 18 19 20 21 22 23
3911 4th vec: 24 25 26 27 28 29 30 31
3913 The output sequence should be:
3915 1st vec: 0 8 16 24 1 9 17 25
3916 2nd vec: 2 10 18 26 3 11 19 27
3917 3rd vec: 4 12 20 28 5 13 21 29
3918 4th vec: 6 14 22 30 7 15 23 31
3920 i.e., we interleave the contents of the four vectors in their order.
3922 We use interleave_high/low instructions to create such output. The input of
3923 each interleave_high/low operation is two vectors:
3924 1st vec 2nd vec
3925 0 1 2 3 4 5 6 7
3926 the even elements of the result vector are obtained left-to-right from the
3927 high/low elements of the first vector. The odd elements of the result are
3928 obtained left-to-right from the high/low elements of the second vector.
3929 The output of interleave_high will be: 0 4 1 5
3930 and of interleave_low: 2 6 3 7
3933 The permutation is done in log2 (LENGTH) stages. In each stage interleave_high
3934 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
3935 where the first argument is taken from the first half of DR_CHAIN and the
3936 second argument from its second half.
3937 In our example,
3939 I1: interleave_high (1st vec, 3rd vec)
3940 I2: interleave_low (1st vec, 3rd vec)
3941 I3: interleave_high (2nd vec, 4th vec)
3942 I4: interleave_low (2nd vec, 4th vec)
3944 The output for the first stage is:
3946 I1: 0 16 1 17 2 18 3 19
3947 I2: 4 20 5 21 6 22 7 23
3948 I3: 8 24 9 25 10 26 11 27
3949 I4: 12 28 13 29 14 30 15 31
3951 The output of the second stage, i.e. the final result is:
3953 I1: 0 8 16 24 1 9 17 25
3954 I2: 2 10 18 26 3 11 19 27
3955 I3: 4 12 20 28 5 13 21 29
3956 I4: 6 14 22 30 7 15 23 31. */
3958 void
3959 vect_permute_store_chain (vec<tree> dr_chain,
3960 unsigned int length,
3961 gimple stmt,
3962 gimple_stmt_iterator *gsi,
3963 vec<tree> *result_chain)
3965 tree vect1, vect2, high, low;
3966 gimple perm_stmt;
3967 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
3968 tree perm_mask_low, perm_mask_high;
3969 unsigned int i, n;
3970 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
3971 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3973 result_chain->quick_grow (length);
3974 memcpy (result_chain->address (), dr_chain.address (),
3975 length * sizeof (tree));
3977 for (i = 0, n = nelt / 2; i < n; i++)
3979 sel[i * 2] = i;
3980 sel[i * 2 + 1] = i + nelt;
3982 perm_mask_high = vect_gen_perm_mask (vectype, sel);
3983 gcc_assert (perm_mask_high != NULL);
3985 for (i = 0; i < nelt; i++)
3986 sel[i] += nelt / 2;
3987 perm_mask_low = vect_gen_perm_mask (vectype, sel);
3988 gcc_assert (perm_mask_low != NULL);
3990 for (i = 0, n = exact_log2 (length); i < n; i++)
3992 for (j = 0; j < length/2; j++)
3994 vect1 = dr_chain[j];
3995 vect2 = dr_chain[j+length/2];
3997 /* Create interleaving stmt:
3998 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
3999 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4000 perm_stmt
4001 = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
4002 vect1, vect2, perm_mask_high);
4003 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4004 (*result_chain)[2*j] = high;
4006 /* Create interleaving stmt:
4007 low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
4008 nelt*3/2+1, ...}> */
4009 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4010 perm_stmt
4011 = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
4012 vect1, vect2, perm_mask_low);
4013 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4014 (*result_chain)[2*j+1] = low;
4016 memcpy (dr_chain.address (), result_chain->address (),
4017 length * sizeof (tree));
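/* For the 4-vector, 8-element example in the comment above, the masks
   built here are perm_mask_high = {0,8,1,9,2,10,3,11} and
   perm_mask_low = {4,12,5,13,6,14,7,15}; the first stage then emits
   (SSA names illustrative):

     vect_inter_high_1 = VEC_PERM_EXPR <1st vec, 3rd vec, {0,8,1,9,2,10,3,11}>;
     vect_inter_low_2  = VEC_PERM_EXPR <1st vec, 3rd vec, {4,12,5,13,6,14,7,15}>;

   which produce I1 and I2 from the example.  */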
4021 /* Function vect_setup_realignment
4023 This function is called when vectorizing an unaligned load using
4024 the dr_explicit_realign[_optimized] scheme.
4025 This function generates the following code at the loop prolog:
4027 p = initial_addr;
4028 x msq_init = *(floor(p)); # prolog load
4029 realignment_token = call target_builtin;
4030 loop:
4031 x msq = phi (msq_init, ---)
4033 The stmts marked with x are generated only for the case of
4034 dr_explicit_realign_optimized.
4036 The code above sets up a new (vector) pointer, pointing to the first
4037 location accessed by STMT, and a "floor-aligned" load using that pointer.
4038 It also generates code to compute the "realignment-token" (if the relevant
4039 target hook was defined), and creates a phi-node at the loop-header bb
4040 whose arguments are the result of the prolog-load (created by this
4041 function) and the result of a load that takes place in the loop (to be
4042 created by the caller to this function).
4044 For the case of dr_explicit_realign_optimized:
4045 The caller to this function uses the phi-result (msq) to create the
4046 realignment code inside the loop, and sets up the missing phi argument,
4047 as follows:
4048 loop:
4049 msq = phi (msq_init, lsq)
4050 lsq = *(floor(p')); # load in loop
4051 result = realign_load (msq, lsq, realignment_token);
4053 For the case of dr_explicit_realign:
4054 loop:
4055 msq = *(floor(p)); # load in loop
4056 p' = p + (VS-1);
4057 lsq = *(floor(p')); # load in loop
4058 result = realign_load (msq, lsq, realignment_token);
4060 Input:
4061 STMT - (scalar) load stmt to be vectorized. This load accesses
4062 a memory location that may be unaligned.
4063 BSI - place where new code is to be inserted.
4064 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4065 is used.
4067 Output:
4068 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4069 target hook, if defined.
4070 Return value - the result of the loop-header phi node. */
4072 tree
4073 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4074 tree *realignment_token,
4075 enum dr_alignment_support alignment_support_scheme,
4076 tree init_addr,
4077 struct loop **at_loop)
4079 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4080 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4081 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4082 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4083 struct loop *loop = NULL;
4084 edge pe = NULL;
4085 tree scalar_dest = gimple_assign_lhs (stmt);
4086 tree vec_dest;
4087 gimple inc;
4088 tree ptr;
4089 tree data_ref;
4090 gimple new_stmt;
4091 basic_block new_bb;
4092 tree msq_init = NULL_TREE;
4093 tree new_temp;
4094 gimple phi_stmt;
4095 tree msq = NULL_TREE;
4096 gimple_seq stmts = NULL;
4097 bool inv_p;
4098 bool compute_in_loop = false;
4099 bool nested_in_vect_loop = false;
4100 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4101 struct loop *loop_for_initial_load = NULL;
4103 if (loop_vinfo)
4105 loop = LOOP_VINFO_LOOP (loop_vinfo);
4106 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4109 gcc_assert (alignment_support_scheme == dr_explicit_realign
4110 || alignment_support_scheme == dr_explicit_realign_optimized);
4112 /* We need to generate three things:
4113 1. the misalignment computation
4114 2. the extra vector load (for the optimized realignment scheme).
4115 3. the phi node for the two vectors from which the realignment is
4116 done (for the optimized realignment scheme). */
4118 /* 1. Determine where to generate the misalignment computation.
4120 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4121 calculation will be generated by this function, outside the loop (in the
4122 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4123 caller, inside the loop.
4125 Background: If the misalignment remains fixed throughout the iterations of
4126 the loop, then both realignment schemes are applicable, and also the
4127 misalignment computation can be done outside LOOP. This is because we are
4128 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4129 are a multiple of VS (the Vector Size), and therefore the misalignment in
4130 different vectorized LOOP iterations is always the same.
4131 The problem arises only if the memory access is in an inner-loop nested
4132 inside LOOP, which is now being vectorized using outer-loop vectorization.
4133 This is the only case when the misalignment of the memory access may not
4134 remain fixed throughout the iterations of the inner-loop (as explained in
4135 detail in vect_supportable_dr_alignment). In this case, not only is the
4136 optimized realignment scheme not applicable, but also the misalignment
4137 computation (and generation of the realignment token that is passed to
4138 REALIGN_LOAD) has to be done inside the loop.
4140 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4141 or not, which in turn determines if the misalignment is computed inside
4142 the inner-loop, or outside LOOP. */
4144 if (init_addr != NULL_TREE || !loop_vinfo)
4146 compute_in_loop = true;
4147 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4151 /* 2. Determine where to generate the extra vector load.
4153 For the optimized realignment scheme, instead of generating two vector
4154 loads in each iteration, we generate a single extra vector load in the
4155 preheader of the loop, and in each iteration reuse the result of the
4156 vector load from the previous iteration. In case the memory access is in
4157 an inner-loop nested inside LOOP, which is now being vectorized using
4158 outer-loop vectorization, we need to determine whether this initial vector
4159 load should be generated at the preheader of the inner-loop, or can be
4160 generated at the preheader of LOOP. If the memory access has no evolution
4161 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4162 to be generated inside LOOP (in the preheader of the inner-loop). */
4164 if (nested_in_vect_loop)
4166 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4167 bool invariant_in_outerloop =
4168 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4169 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4171 else
4172 loop_for_initial_load = loop;
4173 if (at_loop)
4174 *at_loop = loop_for_initial_load;
4176 if (loop_for_initial_load)
4177 pe = loop_preheader_edge (loop_for_initial_load);
4179 /* 3. For the case of the optimized realignment, create the first vector
4180 load at the loop preheader. */
4182 if (alignment_support_scheme == dr_explicit_realign_optimized)
4184 /* Create msq_init = *(floor(p1)) in the loop preheader */
4186 gcc_assert (!compute_in_loop);
4187 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4188 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4189 NULL_TREE, &init_addr, NULL, &inc,
4190 true, &inv_p);
4191 new_temp = copy_ssa_name (ptr, NULL);
4192 new_stmt = gimple_build_assign_with_ops
4193 (BIT_AND_EXPR, new_temp, ptr,
4194 build_int_cst (TREE_TYPE (ptr),
4195 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4196 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4197 gcc_assert (!new_bb);
4198 data_ref
4199 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4200 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4201 new_stmt = gimple_build_assign (vec_dest, data_ref);
4202 new_temp = make_ssa_name (vec_dest, new_stmt);
4203 gimple_assign_set_lhs (new_stmt, new_temp);
4204 if (pe)
4206 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4207 gcc_assert (!new_bb);
4209 else
4210 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4212 msq_init = gimple_assign_lhs (new_stmt);
4215 /* 4. Create realignment token using a target builtin, if available.
4216 It is done either inside the containing loop, or before LOOP (as
4217 determined above). */
4219 if (targetm.vectorize.builtin_mask_for_load)
4221 tree builtin_decl;
4223 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4224 if (!init_addr)
4226 /* Generate the INIT_ADDR computation outside LOOP. */
4227 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4228 NULL_TREE, loop);
4229 if (loop)
4231 pe = loop_preheader_edge (loop);
4232 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4233 gcc_assert (!new_bb);
4235 else
4236 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4239 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4240 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4241 vec_dest =
4242 vect_create_destination_var (scalar_dest,
4243 gimple_call_return_type (new_stmt));
4244 new_temp = make_ssa_name (vec_dest, new_stmt);
4245 gimple_call_set_lhs (new_stmt, new_temp);
4247 if (compute_in_loop)
4248 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4249 else
4251 /* Generate the misalignment computation outside LOOP. */
4252 pe = loop_preheader_edge (loop);
4253 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4254 gcc_assert (!new_bb);
4257 *realignment_token = gimple_call_lhs (new_stmt);
4259 /* The result of the CALL_EXPR to this builtin is determined from
4260 the value of the parameter and no global variables are touched,
4261 which makes the builtin a "const" function. Requiring the
4262 builtin to have the "const" attribute makes it unnecessary
4263 to call mark_call_clobbered. */
4264 gcc_assert (TREE_READONLY (builtin_decl));
4267 if (alignment_support_scheme == dr_explicit_realign)
4268 return msq;
4270 gcc_assert (!compute_in_loop);
4271 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4274 /* 5. Create msq = phi <msq_init, lsq> in loop */
4276 pe = loop_preheader_edge (containing_loop);
4277 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4278 msq = make_ssa_name (vec_dest, NULL);
4279 phi_stmt = create_phi_node (msq, containing_loop->header);
4280 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4282 return msq;
4286 /* Function vect_grouped_load_supported.
4288 Returns TRUE if even and odd permutations are supported,
4289 and FALSE otherwise. */
4291 bool
4292 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4294 enum machine_mode mode = TYPE_MODE (vectype);
4296 /* vect_permute_load_chain requires the group size to be a power of two. */
4297 if (exact_log2 (count) == -1)
4299 if (dump_enabled_p ())
4300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4301 "the size of the group of accesses"
4302 " is not a power of 2");
4303 return false;
4306 /* Check that the permutation is supported. */
4307 if (VECTOR_MODE_P (mode))
4309 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4310 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4312 for (i = 0; i < nelt; i++)
4313 sel[i] = i * 2;
4314 if (can_vec_perm_p (mode, false, sel))
4316 for (i = 0; i < nelt; i++)
4317 sel[i] = i * 2 + 1;
4318 if (can_vec_perm_p (mode, false, sel))
4319 return true;
4323 if (dump_enabled_p ())
4324 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4325 "extract even/odd not supported by target");
4326 return false;
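/* Worked example of the selectors checked above, assuming a vector mode
   with nelt == 4: the extract-even selector is {0, 2, 4, 6} and the
   extract-odd selector is {1, 3, 5, 7}.  */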
4329 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
4330 type VECTYPE. */
4332 bool
4333 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4335 return vect_lanes_optab_supported_p ("vec_load_lanes",
4336 vec_load_lanes_optab,
4337 vectype, count);
4340 /* Function vect_permute_load_chain.
4342 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
4343 a power of 2, generate extract_even/odd stmts to reorder the input data
4344 correctly. Return the final references for loads in RESULT_CHAIN.
4346 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4347 The input is 4 vectors each containing 8 elements. We assign a number to each
4348 element, the input sequence is:
4350 1st vec: 0 1 2 3 4 5 6 7
4351 2nd vec: 8 9 10 11 12 13 14 15
4352 3rd vec: 16 17 18 19 20 21 22 23
4353 4th vec: 24 25 26 27 28 29 30 31
4355 The output sequence should be:
4357 1st vec: 0 4 8 12 16 20 24 28
4358 2nd vec: 1 5 9 13 17 21 25 29
4359 3rd vec: 2 6 10 14 18 22 26 30
4360 4th vec: 3 7 11 15 19 23 27 31
4362 i.e., the first output vector should contain the first elements of each
4363 interleaving group, etc.
4365 We use extract_even/odd instructions to create such output. The input of
4366 each extract_even/odd operation is two vectors
4367 1st vec 2nd vec
4368 0 1 2 3 4 5 6 7
4370 and the output is the vector of extracted even/odd elements. The output of
4371 extract_even will be: 0 2 4 6
4372 and of extract_odd: 1 3 5 7
4375 The permutation is done in log2 (LENGTH) stages. In each stage extract_even
4376 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
4377 their order. In our example,
4379 E1: extract_even (1st vec, 2nd vec)
4380 E2: extract_odd (1st vec, 2nd vec)
4381 E3: extract_even (3rd vec, 4th vec)
4382 E4: extract_odd (3rd vec, 4th vec)
4384 The output for the first stage will be:
4386 E1: 0 2 4 6 8 10 12 14
4387 E2: 1 3 5 7 9 11 13 15
4388 E3: 16 18 20 22 24 26 28 30
4389 E4: 17 19 21 23 25 27 29 31
4391 In order to proceed and create the correct sequence for the next stage (or
4392 for the correct output, if the second stage is the last one, as in our
4393 example), we first put the output of extract_even operation and then the
4394 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
4395 The input for the second stage is:
4397 1st vec (E1): 0 2 4 6 8 10 12 14
4398 2nd vec (E3): 16 18 20 22 24 26 28 30
4399 3rd vec (E2): 1 3 5 7 9 11 13 15
4400 4th vec (E4): 17 19 21 23 25 27 29 31
4402 The output of the second stage:
4404 E1: 0 4 8 12 16 20 24 28
4405 E2: 2 6 10 14 18 22 26 30
4406 E3: 1 5 9 13 17 21 25 29
4407 E4: 3 7 11 15 19 23 27 31
4409 And RESULT_CHAIN after reordering:
4411 1st vec (E1): 0 4 8 12 16 20 24 28
4412 2nd vec (E3): 1 5 9 13 17 21 25 29
4413 3rd vec (E2): 2 6 10 14 18 22 26 30
4414 4th vec (E4): 3 7 11 15 19 23 27 31. */
4416 static void
4417 vect_permute_load_chain (vec<tree> dr_chain,
4418 unsigned int length,
4419 gimple stmt,
4420 gimple_stmt_iterator *gsi,
4421 vec<tree> *result_chain)
4423 tree data_ref, first_vect, second_vect;
4424 tree perm_mask_even, perm_mask_odd;
4425 gimple perm_stmt;
4426 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4427 unsigned int i, j, log_length = exact_log2 (length);
4428 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
4429 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4431 result_chain->quick_grow (length);
4432 memcpy (result_chain->address (), dr_chain.address (),
4433 length * sizeof (tree));
4435 for (i = 0; i < nelt; ++i)
4436 sel[i] = i * 2;
4437 perm_mask_even = vect_gen_perm_mask (vectype, sel);
4438 gcc_assert (perm_mask_even != NULL);
4440 for (i = 0; i < nelt; ++i)
4441 sel[i] = i * 2 + 1;
4442 perm_mask_odd = vect_gen_perm_mask (vectype, sel);
4443 gcc_assert (perm_mask_odd != NULL);
4445 for (i = 0; i < log_length; i++)
4447 for (j = 0; j < length; j += 2)
4449 first_vect = dr_chain[j];
4450 second_vect = dr_chain[j+1];
4452 /* data_ref = permute_even (first_data_ref, second_data_ref); */
4453 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
4454 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4455 first_vect, second_vect,
4456 perm_mask_even);
4457 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4458 (*result_chain)[j/2] = data_ref;
4460 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
4461 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
4462 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4463 first_vect, second_vect,
4464 perm_mask_odd);
4465 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4466 (*result_chain)[j/2+length/2] = data_ref;
4468 memcpy (dr_chain.address (), result_chain->address (),
4469 length * sizeof (tree));
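/* For the 4-vector, 8-element example in the comment above, the masks
   built here are perm_mask_even = {0,2,4,6,8,10,12,14} and
   perm_mask_odd = {1,3,5,7,9,11,13,15}; the first stage then emits
   (SSA names illustrative):

     vect_perm_even_1 = VEC_PERM_EXPR <1st vec, 2nd vec, {0,2,4,6,8,10,12,14}>;
     vect_perm_odd_2  = VEC_PERM_EXPR <1st vec, 2nd vec, {1,3,5,7,9,11,13,15}>;

   which produce E1 and E2 from the example.  */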
4474 /* Function vect_transform_grouped_load.
4476 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
4477 to perform their permutation and ascribe the resulting vectorized statements to
4478 the scalar statements.
4481 void
4482 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
4483 gimple_stmt_iterator *gsi)
4485 vec<tree> result_chain = vNULL;
4487 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
4488 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
4489 vectors, that are ready for vector computation. */
4490 result_chain.create (size);
4491 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
4492 vect_record_grouped_load_vectors (stmt, result_chain);
4493 result_chain.release ();
4496 /* RESULT_CHAIN contains the output of a group of grouped loads that were
4497 generated as part of the vectorization of STMT. Assign the statement
4498 for each vector to the associated scalar statement. */
4500 void
4501 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
4503 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
4504 gimple next_stmt, new_stmt;
4505 unsigned int i, gap_count;
4506 tree tmp_data_ref;
4508 /* Put a permuted data-ref in the VECTORIZED_STMT field.
4509 Since we scan the chain starting from its first node, their order
4510 corresponds to the order of data-refs in RESULT_CHAIN. */
4511 next_stmt = first_stmt;
4512 gap_count = 1;
4513 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
4515 if (!next_stmt)
4516 break;
4518 /* Skip the gaps. Loads created for the gaps will be removed by dead
4519 code elimination pass later. No need to check for the first stmt in
4520 the group, since it always exists.
4521 GROUP_GAP is the number of steps in elements from the previous
4522 access (if there is no gap GROUP_GAP is 1). We skip loads that
4523 correspond to the gaps. */
4524 if (next_stmt != first_stmt
4525 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
4527 gap_count++;
4528 continue;
4531 while (next_stmt)
4533 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
4534 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
4535 copies, and we put the new vector statement in the first available
4536 RELATED_STMT. */
4537 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
4538 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
4539 else
4541 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4543 gimple prev_stmt =
4544 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
4545 gimple rel_stmt =
4546 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
4547 while (rel_stmt)
4549 prev_stmt = rel_stmt;
4550 rel_stmt =
4551 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
4554 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
4555 new_stmt;
4559 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
4560 gap_count = 1;
4561 /* If NEXT_STMT accesses the same DR as the previous statement,
4562 put the same TMP_DATA_REF as its vectorized statement; otherwise
4563 get the next data-ref from RESULT_CHAIN. */
4564 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4565 break;
4570 /* Function vect_can_force_dr_alignment_p.
4572 Returns whether the alignment of a DECL can be forced to be aligned
4573 on an ALIGNMENT-bit boundary. */
4575 bool
4576 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
4578 if (TREE_CODE (decl) != VAR_DECL)
4579 return false;
4581 /* We cannot change alignment of common or external symbols as another
4582 translation unit may contain a definition with lower alignment.
4583 The rules of common symbol linking mean that the definition
4584 will override the common symbol. The same is true for constant
4585 pool entries which may be shared and are not properly merged
4586 by LTO. */
4587 if (DECL_EXTERNAL (decl)
4588 || DECL_COMMON (decl)
4589 || DECL_IN_CONSTANT_POOL (decl))
4590 return false;
4592 if (TREE_ASM_WRITTEN (decl))
4593 return false;
4595 /* Do not override the alignment as specified by the ABI when the used
4596 attribute is set. */
4597 if (DECL_PRESERVE_P (decl))
4598 return false;
4600 /* Do not override explicit alignment set by the user when an explicit
4601 section name is also used. This is a common idiom used by many
4602 software projects. */
4603 if (DECL_SECTION_NAME (decl) != NULL_TREE
4604 && !DECL_HAS_IMPLICIT_SECTION_NAME_P (decl))
4605 return false;
4607 if (TREE_STATIC (decl))
4608 return (alignment <= MAX_OFILE_ALIGNMENT);
4609 else
4610 return (alignment <= MAX_STACK_ALIGNMENT);
4614 /* Return whether the data reference DR is supported with respect to its
4615 alignment.
4616 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
4617 if it is aligned, i.e., check if it is possible to vectorize it with different
4618 alignment. */
4620 enum dr_alignment_support
4621 vect_supportable_dr_alignment (struct data_reference *dr,
4622 bool check_aligned_accesses)
4624 gimple stmt = DR_STMT (dr);
4625 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4626 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4627 enum machine_mode mode = TYPE_MODE (vectype);
4628 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4629 struct loop *vect_loop = NULL;
4630 bool nested_in_vect_loop = false;
4632 if (aligned_access_p (dr) && !check_aligned_accesses)
4633 return dr_aligned;
4635 if (loop_vinfo)
4637 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
4638 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
4641 /* Possibly unaligned access. */
4643 /* We can choose between using the implicit realignment scheme (generating
4644 a misaligned_move stmt) and the explicit realignment scheme (generating
4645 aligned loads with a REALIGN_LOAD). There are two variants to the
4646 explicit realignment scheme: optimized, and unoptimized.
4647 We can optimize the realignment only if the step between consecutive
4648 vector loads is equal to the vector size. Since the vector memory
4649 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
4650 is guaranteed that the misalignment amount remains the same throughout the
4651 execution of the vectorized loop. Therefore, we can create the
4652 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
4653 at the loop preheader.
4655 However, in the case of outer-loop vectorization, when vectorizing a
4656 memory access in the inner-loop nested within the LOOP that is now being
4657 vectorized, while it is guaranteed that the misalignment of the
4658 vectorized memory access will remain the same in different outer-loop
4659 iterations, it is *not* guaranteed that it will remain the same throughout
4660 the execution of the inner-loop. This is because the inner-loop advances
4661 with the original scalar step (and not in steps of VS). If the inner-loop
4662 step happens to be a multiple of VS, then the misalignment remains fixed
4663 and we can use the optimized realignment scheme. For example:
4665 for (i=0; i<N; i++)
4666 for (j=0; j<M; j++)
4667 s += a[i+j];
4669 When vectorizing the i-loop in the above example, the step between
4670 consecutive vector loads is 1, and so the misalignment does not remain
4671 fixed across the execution of the inner-loop, and the realignment cannot
4672 be optimized (as illustrated in the following pseudo vectorized loop):
4674 for (i=0; i<N; i+=4)
4675 for (j=0; j<M; j++){
4676 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
4677 // when j is {0,1,2,3,4,5,6,7,...} respectively.
4678 // (assuming that we start from an aligned address).
4681 We therefore have to use the unoptimized realignment scheme:
4683 for (i=0; i<N; i+=4)
4684 for (j=k; j<M; j+=4)
4685 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
4686 // that the misalignment of the initial address is
4687 // 0).
4689 The loop can then be vectorized as follows:
4691 for (k=0; k<4; k++){
4692 rt = get_realignment_token (&vp[k]);
4693 for (i=0; i<N; i+=4){
4694 v1 = vp[i+k];
4695 for (j=k; j<M; j+=4){
4696 v2 = vp[i+j+VS-1];
4697 va = REALIGN_LOAD <v1,v2,rt>;
4698 vs += va;
4699 v1 = v2;
4702 } */
4704 if (DR_IS_READ (dr))
4706 bool is_packed = false;
4707 tree type = (TREE_TYPE (DR_REF (dr)));
4709 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
4710 && (!targetm.vectorize.builtin_mask_for_load
4711 || targetm.vectorize.builtin_mask_for_load ()))
4713 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4714 if ((nested_in_vect_loop
4715 && (TREE_INT_CST_LOW (DR_STEP (dr))
4716 != GET_MODE_SIZE (TYPE_MODE (vectype))))
4717 || !loop_vinfo)
4718 return dr_explicit_realign;
4719 else
4720 return dr_explicit_realign_optimized;
4722 if (!known_alignment_for_access_p (dr))
4723 is_packed = not_size_aligned (DR_REF (dr));
4725 if (targetm.vectorize.
4726 support_vector_misalignment (mode, type,
4727 DR_MISALIGNMENT (dr), is_packed))
4728 /* Can't software pipeline the loads, but can at least do them. */
4729 return dr_unaligned_supported;
4731 else
4733 bool is_packed = false;
4734 tree type = (TREE_TYPE (DR_REF (dr)));
4736 if (!known_alignment_for_access_p (dr))
4737 is_packed = not_size_aligned (DR_REF (dr));
4739 if (targetm.vectorize.
4740 support_vector_misalignment (mode, type,
4741 DR_MISALIGNMENT (dr), is_packed))
4742 return dr_unaligned_supported;
4745 /* Unsupported. */
4746 return dr_unaligned_unsupported;