gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2013 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "tm_p.h"
30 #include "target.h"
31 #include "basic-block.h"
32 #include "gimple-pretty-print.h"
33 #include "tree-flow.h"
34 #include "dumpfile.h"
35 #include "cfgloop.h"
36 #include "tree-chrec.h"
37 #include "tree-scalar-evolution.h"
38 #include "tree-vectorizer.h"
39 #include "diagnostic-core.h"
41 /* Need to include rtl.h, expr.h, etc. for optabs. */
42 #include "expr.h"
43 #include "optabs.h"
45 /* Return true if load- or store-lanes optab OPTAB is implemented for
46 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
48 static bool
49 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
50 tree vectype, unsigned HOST_WIDE_INT count)
52 enum machine_mode mode, array_mode;
53 bool limit_p;
55 mode = TYPE_MODE (vectype);
56 limit_p = !targetm.array_mode_supported_p (mode, count);
57 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
58 MODE_INT, limit_p);
60 if (array_mode == BLKmode)
62 if (dump_enabled_p ())
63 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
64 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]",
65 GET_MODE_NAME (mode), count);
66 return false;
69 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
71 if (dump_enabled_p ())
72 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
73 "cannot use %s<%s><%s>", name,
74 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
75 return false;
78 if (dump_enabled_p ())
79 dump_printf_loc (MSG_NOTE, vect_location,
80 "can use %s<%s><%s>", name, GET_MODE_NAME (array_mode),
81 GET_MODE_NAME (mode));
83 return true;
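/* Illustrative example (not part of the original source): for VECTYPE V4SI
   (a 128-bit vector mode) and COUNT 2, the code above asks the target for a
   256-bit integer array mode and then checks whether the load-lanes /
   store-lanes convert optab has a handler for that <array mode, V4SI> pair.
   On a target such as ARM NEON this roughly corresponds to asking whether a
   vld2/vst2 style instruction is available for two V4SI vectors.  */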
87 /* Return the smallest scalar part of STMT.
88 This is used to determine the vectype of the stmt. We generally set the
89 vectype according to the type of the result (lhs). For stmts whose
90 result-type is different than the type of the arguments (e.g., demotion,
91 promotion), vectype will be reset appropriately (later). Note that we have
92 to visit the smallest datatype in this function, because that determines the
93 VF. If the smallest datatype in the loop is present only as the rhs of a
94 promotion operation - we'd miss it.
95 Such a case, where a variable of this datatype does not appear in the lhs
96 anywhere in the loop, can only occur if it's an invariant: e.g.:
97 'int_x = (int) short_inv', which we'd expect to have been optimized away by
98 invariant motion. However, we cannot rely on invariant motion to always
99 take invariants out of the loop, and so in the case of promotion we also
100 have to check the rhs.
101 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
102 types. */
104 tree
105 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
106 HOST_WIDE_INT *rhs_size_unit)
108 tree scalar_type = gimple_expr_type (stmt);
109 HOST_WIDE_INT lhs, rhs;
111 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
113 if (is_gimple_assign (stmt)
114 && (gimple_assign_cast_p (stmt)
115 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
116 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
117 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
119 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
121 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
122 if (rhs < lhs)
123 scalar_type = rhs_type;
126 *lhs_size_unit = lhs;
127 *rhs_size_unit = rhs;
128 return scalar_type;
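/* Illustrative example (not part of the original source): in a loop such as

     short *s;  int *d;
     for (i = 0; i < n; i++)
       d[i] = (int) s[i];

   the result type of the cast is 'int', but the vectorization factor must be
   derived from the 2-byte 'short' source operand, so the function above
   returns the rhs type ('short') with *rhs_size_unit == 2.  */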
132 /* Check if the data references pointed to by DR_I and DR_J are the same or
133 belong to the same interleaving group. Return FALSE if the drs are
134 different, otherwise return TRUE. */
136 static bool
137 vect_same_range_drs (data_reference_p dr_i, data_reference_p dr_j)
139 gimple stmt_i = DR_STMT (dr_i);
140 gimple stmt_j = DR_STMT (dr_j);
142 if (operand_equal_p (DR_REF (dr_i), DR_REF (dr_j), 0)
143 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
144 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j))
145 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
146 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j)))))
147 return true;
148 else
149 return false;
152 /* If address ranges represented by DDR_I and DDR_J are equal,
153 return TRUE, otherwise return FALSE. */
155 static bool
156 vect_vfa_range_equal (ddr_p ddr_i, ddr_p ddr_j)
158 if ((vect_same_range_drs (DDR_A (ddr_i), DDR_A (ddr_j))
159 && vect_same_range_drs (DDR_B (ddr_i), DDR_B (ddr_j)))
160 || (vect_same_range_drs (DDR_A (ddr_i), DDR_B (ddr_j))
161 && vect_same_range_drs (DDR_B (ddr_i), DDR_A (ddr_j))))
162 return true;
163 else
164 return false;
167 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
168 tested at run-time. Return TRUE if DDR was successfully inserted.
169 Return false if versioning is not supported. */
171 static bool
172 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
174 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
176 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
177 return false;
179 if (dump_enabled_p ())
181 dump_printf_loc (MSG_NOTE, vect_location,
182 "mark for run-time aliasing test between ");
183 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
184 dump_printf (MSG_NOTE, " and ");
185 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
188 if (optimize_loop_nest_for_size_p (loop))
190 if (dump_enabled_p ())
191 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
192 "versioning not supported when optimizing for size.");
193 return false;
196 /* FORNOW: We don't support versioning with outer-loop vectorization. */
197 if (loop->inner)
199 if (dump_enabled_p ())
200 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
201 "versioning not yet supported for outer-loops.");
202 return false;
205 /* FORNOW: We don't support creating runtime alias tests for non-constant
206 step. */
207 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
208 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
210 if (dump_enabled_p ())
211 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
212 "versioning not yet supported for non-constant "
213 "step");
214 return false;
217 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
218 return true;
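/* Illustrative sketch (an assumption about the later transform, not code from
   this function): for a DDR recorded above, loop versioning for alias
   eventually guards the vectorized loop with an overlap test of roughly this
   shape:

     if (addr_a + segment_len_a <= addr_b
         || addr_b + segment_len_b <= addr_a)
       ... vectorized loop ...
     else
       ... scalar loop ...
*/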
222 /* Function vect_analyze_data_ref_dependence.
224 Return TRUE if there (might) exist a dependence between a memory-reference
225 DRA and a memory-reference DRB. If the dependence can instead be checked
226 at run time by versioning for alias, return FALSE. Adjust *MAX_VF according
227 to the data dependence. */
229 static bool
230 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
231 loop_vec_info loop_vinfo, int *max_vf)
233 unsigned int i;
234 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
235 struct data_reference *dra = DDR_A (ddr);
236 struct data_reference *drb = DDR_B (ddr);
237 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
238 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
239 lambda_vector dist_v;
240 unsigned int loop_depth;
242 /* In loop analysis all data references should be vectorizable. */
243 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
244 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
245 gcc_unreachable ();
247 /* Independent data accesses. */
248 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
249 return false;
251 if (dra == drb
252 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
253 return false;
255 /* Unknown data dependence. */
256 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
258 if (STMT_VINFO_GATHER_P (stmtinfo_a)
259 || STMT_VINFO_GATHER_P (stmtinfo_b))
261 if (dump_enabled_p ())
263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
264 "versioning for alias not supported for: "
265 "can't determine dependence between ");
266 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
267 DR_REF (dra));
268 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
269 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
270 DR_REF (drb));
272 return true;
275 if (dump_enabled_p ())
277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
278 "versioning for alias required: "
279 "can't determine dependence between ");
280 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
281 DR_REF (dra));
282 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
283 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
284 DR_REF (drb));
287 /* Add to list of ddrs that need to be tested at run-time. */
288 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
291 /* Known data dependence. */
292 if (DDR_NUM_DIST_VECTS (ddr) == 0)
294 if (STMT_VINFO_GATHER_P (stmtinfo_a)
295 || STMT_VINFO_GATHER_P (stmtinfo_b))
297 if (dump_enabled_p ())
299 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
300 "versioning for alias not supported for: "
301 "bad dist vector for ");
302 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
303 DR_REF (dra));
304 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
305 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
306 DR_REF (drb));
308 return true;
311 if (dump_enabled_p ())
313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
314 "versioning for alias required: "
315 "bad dist vector for ");
316 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
317 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
318 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
320 /* Add to list of ddrs that need to be tested at run-time. */
321 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
324 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
325 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
327 int dist = dist_v[loop_depth];
329 if (dump_enabled_p ())
330 dump_printf_loc (MSG_NOTE, vect_location,
331 "dependence distance = %d.", dist);
333 if (dist == 0)
335 if (dump_enabled_p ())
337 dump_printf_loc (MSG_NOTE, vect_location,
338 "dependence distance == 0 between ");
339 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
340 dump_printf (MSG_NOTE, " and ");
341 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
344 /* When we perform grouped accesses and perform implicit CSE
345 by detecting equal accesses and doing disambiguation with
346 runtime alias tests like for
347 .. = a[i];
348 .. = a[i+1];
349 a[i] = ..;
350 a[i+1] = ..;
351 *p = ..;
352 .. = a[i];
353 .. = a[i+1];
354 where we will end up loading { a[i], a[i+1] } once, make
355 sure that inserting group loads before the first load and
356 stores after the last store will do the right thing. */
357 if ((STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
358 && GROUP_SAME_DR_STMT (stmtinfo_a))
359 || (STMT_VINFO_GROUPED_ACCESS (stmtinfo_b)
360 && GROUP_SAME_DR_STMT (stmtinfo_b)))
362 gimple earlier_stmt;
363 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
364 if (DR_IS_WRITE
365 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
367 if (dump_enabled_p ())
368 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
369 "READ_WRITE dependence in interleaving.");
370 return true;
374 continue;
377 if (dist > 0 && DDR_REVERSED_P (ddr))
379 /* If DDR_REVERSED_P the order of the data-refs in DDR was
380 reversed (to make distance vector positive), and the actual
381 distance is negative. */
382 if (dump_enabled_p ())
383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
384 "dependence distance negative.");
385 continue;
388 if (abs (dist) >= 2
389 && abs (dist) < *max_vf)
391 /* The dependence distance requires reduction of the maximal
392 vectorization factor. */
393 *max_vf = abs (dist);
394 if (dump_enabled_p ())
395 dump_printf_loc (MSG_NOTE, vect_location,
396 "adjusting maximal vectorization factor to %i",
397 *max_vf);
400 if (abs (dist) >= *max_vf)
402 /* Dependence distance does not create dependence, as far as
403 vectorization is concerned, in this case. */
404 if (dump_enabled_p ())
405 dump_printf_loc (MSG_NOTE, vect_location,
406 "dependence distance >= VF.");
407 continue;
410 if (dump_enabled_p ())
412 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
413 "not vectorized, possible dependence "
414 "between data-refs ");
415 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
416 dump_printf (MSG_NOTE, " and ");
417 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
420 return true;
423 return false;
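/* Worked example (illustrative only): for

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1;

   the dependence distance is 3.  The value stored to a[i + 3] is read three
   scalar iterations later, so at most three iterations can be blocked into
   one vector iteration; the code above therefore clamps *max_vf to 3.  */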
426 /* Function vect_analyze_data_ref_dependences.
428 Examine all the data references in the loop, and make sure there do not
429 exist any data dependences between them. Set *MAX_VF according to
430 the maximum vectorization factor the data dependences allow. */
432 bool
433 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
435 unsigned int i;
436 struct data_dependence_relation *ddr;
438 if (dump_enabled_p ())
439 dump_printf_loc (MSG_NOTE, vect_location,
440 "=== vect_analyze_data_ref_dependences ===");
442 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
443 &LOOP_VINFO_DDRS (loop_vinfo),
444 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
445 return false;
447 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
448 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
449 return false;
451 return true;
455 /* Function vect_slp_analyze_data_ref_dependence.
457 Return TRUE if there (might) exist a dependence between a memory-reference
458 DRA and a memory-reference DRB that prevents basic-block vectorization.
459 Unlike the loop variant, no run-time alias check is created here and there
460 is no *MAX_VF to adjust. */
462 static bool
463 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
465 struct data_reference *dra = DDR_A (ddr);
466 struct data_reference *drb = DDR_B (ddr);
468 /* We need to check dependences of statements marked as unvectorizable
469 as well; they can still prohibit vectorization. */
471 /* Independent data accesses. */
472 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
473 return false;
475 if (dra == drb)
476 return false;
478 /* Read-read is OK. */
479 if (DR_IS_READ (dra) && DR_IS_READ (drb))
480 return false;
482 /* If dra and drb are part of the same interleaving chain consider
483 them independent. */
484 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
485 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
486 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
487 return false;
489 /* Unknown data dependence. */
490 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
492 gimple earlier_stmt;
494 if (dump_enabled_p ())
496 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
497 "can't determine dependence between ");
498 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
499 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
500 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
503 /* We do not vectorize basic blocks with write-write dependencies. */
504 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
505 return true;
507 /* Check that it's not a load-after-store dependence. */
508 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
509 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
510 return true;
512 return false;
515 if (dump_enabled_p ())
517 dump_printf_loc (MSG_NOTE, vect_location,
518 "determined dependence between ");
519 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
520 dump_printf (MSG_NOTE, " and ");
521 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
524 /* Do not vectorize basic blocks with write-write dependences. */
525 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
526 return true;
528 /* Check dependence between DRA and DRB for basic block vectorization.
529 If the accesses share the same bases and offsets, we can compare their initial
530 constant offsets to decide whether they differ or not. In case of a read-
531 write dependence we check that the load is before the store to ensure that
532 vectorization will not change the order of the accesses. */
534 HOST_WIDE_INT type_size_a, type_size_b, init_a, init_b;
535 gimple earlier_stmt;
537 /* Check that the data-refs have same bases and offsets. If not, we can't
538 determine if they are dependent. */
539 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
540 || !dr_equal_offsets_p (dra, drb))
541 return true;
543 /* Check the types. */
544 type_size_a = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))));
545 type_size_b = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
547 if (type_size_a != type_size_b
548 || !types_compatible_p (TREE_TYPE (DR_REF (dra)),
549 TREE_TYPE (DR_REF (drb))))
550 return true;
552 init_a = TREE_INT_CST_LOW (DR_INIT (dra));
553 init_b = TREE_INT_CST_LOW (DR_INIT (drb));
555 /* Two different locations - no dependence. */
556 if (init_a != init_b)
557 return false;
559 /* We have a read-write dependence. Check that the load is before the store.
560 When we vectorize basic blocks, a vector load can only be placed before the
561 corresponding scalar load, and a vector store can only be placed after its
562 corresponding scalar store. So the order of the accesses is preserved in
563 case the load is before the store. */
564 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
565 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
566 return false;
568 return true;
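/* Illustrative basic-block example (not part of the original source): with

     x = a[0];
     a[0] = y;

   the load precedes the store at the same location, so the code above returns
   false and SLP vectorization may proceed.  With the opposite order

     a[0] = y;
     x = a[0];

   the earlier statement is the store, i.e. a load-after-store dependence, and
   the function returns true.  */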
572 /* Function vect_slp_analyze_data_ref_dependences.
574 Examine all the data references in the basic-block, and make sure there
575 do not exist any data dependences between them. Return FALSE if a
576 dependence that would prevent vectorization is found. */
578 bool
579 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
581 struct data_dependence_relation *ddr;
582 unsigned int i;
584 if (dump_enabled_p ())
585 dump_printf_loc (MSG_NOTE, vect_location,
586 "=== vect_slp_analyze_data_ref_dependences ===");
588 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
589 &BB_VINFO_DDRS (bb_vinfo),
590 vNULL, true))
591 return false;
593 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
594 if (vect_slp_analyze_data_ref_dependence (ddr))
595 return false;
597 return true;
601 /* Function vect_compute_data_ref_alignment
603 Compute the misalignment of the data reference DR.
605 Output:
606 1. If during the misalignment computation it is found that the data reference
607 cannot be vectorized then false is returned.
608 2. DR_MISALIGNMENT (DR) is defined.
610 FOR NOW: No analysis is actually performed. Misalignment is calculated
611 only for trivial cases. TODO. */
613 static bool
614 vect_compute_data_ref_alignment (struct data_reference *dr)
616 gimple stmt = DR_STMT (dr);
617 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
618 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
619 struct loop *loop = NULL;
620 tree ref = DR_REF (dr);
621 tree vectype;
622 tree base, base_addr;
623 bool base_aligned;
624 tree misalign;
625 tree aligned_to, alignment;
627 if (dump_enabled_p ())
628 dump_printf_loc (MSG_NOTE, vect_location,
629 "vect_compute_data_ref_alignment:");
631 if (loop_vinfo)
632 loop = LOOP_VINFO_LOOP (loop_vinfo);
634 /* Initialize misalignment to unknown. */
635 SET_DR_MISALIGNMENT (dr, -1);
637 /* Strided loads perform only component accesses, misalignment information
638 is irrelevant for them. */
639 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
640 return true;
642 misalign = DR_INIT (dr);
643 aligned_to = DR_ALIGNED_TO (dr);
644 base_addr = DR_BASE_ADDRESS (dr);
645 vectype = STMT_VINFO_VECTYPE (stmt_info);
647 /* In case the dataref is in an inner-loop of the loop that is being
648 vectorized (LOOP), we use the base and misalignment information
649 relative to the outer-loop (LOOP). This is ok only if the misalignment
650 stays the same throughout the execution of the inner-loop, which is why
651 we have to check that the stride of the dataref in the inner-loop is
652 evenly divisible by the vector size. */
653 if (loop && nested_in_vect_loop_p (loop, stmt))
655 tree step = DR_STEP (dr);
656 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
658 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
660 if (dump_enabled_p ())
661 dump_printf_loc (MSG_NOTE, vect_location,
662 "inner step divides the vector-size.");
663 misalign = STMT_VINFO_DR_INIT (stmt_info);
664 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
665 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
667 else
669 if (dump_enabled_p ())
670 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
671 "inner step doesn't divide the vector-size.");
672 misalign = NULL_TREE;
676 /* Similarly, if we're doing basic-block vectorization, we can only use
677 base and misalignment information relative to an innermost loop if the
678 misalignment stays the same throughout the execution of the loop.
679 As above, this is the case if the stride of the dataref is evenly
680 divisible by the vector size. */
681 if (!loop)
683 tree step = DR_STEP (dr);
684 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
686 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
688 if (dump_enabled_p ())
689 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
690 "SLP: step doesn't divide the vector-size.");
691 misalign = NULL_TREE;
695 base = build_fold_indirect_ref (base_addr);
696 alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT);
698 if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
699 || !misalign)
701 if (dump_enabled_p ())
703 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
704 "Unknown alignment for access: ");
705 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
707 return true;
710 if ((DECL_P (base)
711 && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
712 alignment) >= 0)
713 || (TREE_CODE (base_addr) == SSA_NAME
714 && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
715 TREE_TYPE (base_addr)))),
716 alignment) >= 0)
717 || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
718 base_aligned = true;
719 else
720 base_aligned = false;
722 if (!base_aligned)
724 /* Do not change the alignment of global variables here if
725 flag_section_anchors is enabled as we already generated
726 RTL for other functions. Most global variables should
727 have been aligned during the IPA increase_alignment pass. */
728 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
729 || (TREE_STATIC (base) && flag_section_anchors))
731 if (dump_enabled_p ())
733 dump_printf_loc (MSG_NOTE, vect_location,
734 "can't force alignment of ref: ");
735 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
737 return true;
740 /* Force the alignment of the decl.
741 NOTE: This is the only change to the code we make during
742 the analysis phase, before deciding to vectorize the loop. */
743 if (dump_enabled_p ())
745 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
746 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
749 DECL_ALIGN (base) = TYPE_ALIGN (vectype);
750 DECL_USER_ALIGN (base) = 1;
753 /* At this point we assume that the base is aligned. */
754 gcc_assert (base_aligned
755 || (TREE_CODE (base) == VAR_DECL
756 && DECL_ALIGN (base) >= TYPE_ALIGN (vectype)));
758 /* If this is a backward running DR then the first access in the larger
759 vectype is actually N-1 elements before the address in the DR.
760 Adjust misalign accordingly. */
761 if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
763 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
764 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
765 otherwise we wouldn't be here. */
766 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
767 /* PLUS because DR_STEP was negative. */
768 misalign = size_binop (PLUS_EXPR, misalign, offset);
771 /* Modulo alignment. */
772 misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);
774 if (!host_integerp (misalign, 1))
776 /* Negative or overflowed misalignment value. */
777 if (dump_enabled_p ())
778 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
779 "unexpected misalign value");
780 return false;
783 SET_DR_MISALIGNMENT (dr, TREE_INT_CST_LOW (misalign));
785 if (dump_enabled_p ())
787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
788 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
789 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
792 return true;
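/* Worked example (illustrative only, assuming a 16-byte-aligned V4SI vectype):
   for a backward-running access with DR_STEP == -4 and an initial misalignment
   of 12 bytes, the first vector access really starts 3 elements earlier, so
   offset = (4 - 1) * -4 = -12 and the adjusted misalignment is
   12 + (-12) = 0; modulo the 16-byte vector alignment this gives
   DR_MISALIGNMENT == 0.  */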
796 /* Function vect_compute_data_refs_alignment
798 Compute the misalignment of data references in the loop.
799 Return FALSE if a data reference is found that cannot be vectorized. */
801 static bool
802 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
803 bb_vec_info bb_vinfo)
805 vec<data_reference_p> datarefs;
806 struct data_reference *dr;
807 unsigned int i;
809 if (loop_vinfo)
810 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
811 else
812 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
814 FOR_EACH_VEC_ELT (datarefs, i, dr)
815 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
816 && !vect_compute_data_ref_alignment (dr))
818 if (bb_vinfo)
820 /* Mark unsupported statement as unvectorizable. */
821 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
822 continue;
824 else
825 return false;
828 return true;
832 /* Function vect_update_misalignment_for_peel
834 DR - the data reference whose misalignment is to be adjusted.
835 DR_PEEL - the data reference whose misalignment is being made
836 zero in the vector loop by the peel.
837 NPEEL - the number of iterations in the peel loop if the misalignment
838 of DR_PEEL is known at compile time. */
840 static void
841 vect_update_misalignment_for_peel (struct data_reference *dr,
842 struct data_reference *dr_peel, int npeel)
844 unsigned int i;
845 vec<dr_p> same_align_drs;
846 struct data_reference *current_dr;
847 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
848 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
849 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
850 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
852 /* For interleaved data accesses the step in the loop must be multiplied by
853 the size of the interleaving group. */
854 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
855 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
856 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
857 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
859 /* It can be assumed that the data refs with the same alignment as dr_peel
860 are aligned in the vector loop. */
861 same_align_drs
862 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
863 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
865 if (current_dr != dr)
866 continue;
867 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
868 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
869 SET_DR_MISALIGNMENT (dr, 0);
870 return;
873 if (known_alignment_for_access_p (dr)
874 && known_alignment_for_access_p (dr_peel))
876 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
877 int misal = DR_MISALIGNMENT (dr);
878 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
879 misal += negative ? -npeel * dr_size : npeel * dr_size;
880 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
881 SET_DR_MISALIGNMENT (dr, misal);
882 return;
885 if (dump_enabled_p ())
886 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.");
887 SET_DR_MISALIGNMENT (dr, -1);
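/* Worked example (illustrative only, assuming a 16-byte-aligned V4SI vectype):
   if DR_PEEL is a forward access with 4-byte elements misaligned by 8 bytes,
   then NPEEL == 2 scalar iterations.  A second forward 4-byte data-ref DR that
   was misaligned by 4 bytes gets misal = 4 + 2 * 4 = 12, and
   12 & (16 - 1) == 12, so DR_MISALIGNMENT (DR) becomes 12 after the peel.  */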
891 /* Function vect_verify_datarefs_alignment
893 Return TRUE if all data references in the loop can be
894 handled with respect to alignment. */
896 bool
897 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
899 vec<data_reference_p> datarefs;
900 struct data_reference *dr;
901 enum dr_alignment_support supportable_dr_alignment;
902 unsigned int i;
904 if (loop_vinfo)
905 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
906 else
907 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
909 FOR_EACH_VEC_ELT (datarefs, i, dr)
911 gimple stmt = DR_STMT (dr);
912 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
914 if (!STMT_VINFO_RELEVANT_P (stmt_info))
915 continue;
917 /* For interleaving, only the alignment of the first access matters.
918 Skip statements marked as not vectorizable. */
919 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
920 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
921 || !STMT_VINFO_VECTORIZABLE (stmt_info))
922 continue;
924 /* Strided loads perform only component accesses, alignment is
925 irrelevant for them. */
926 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
927 continue;
929 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
930 if (!supportable_dr_alignment)
932 if (dump_enabled_p ())
934 if (DR_IS_READ (dr))
935 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
936 "not vectorized: unsupported unaligned load.");
937 else
938 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
939 "not vectorized: unsupported unaligned "
940 "store.");
942 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
943 DR_REF (dr));
945 return false;
947 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
948 dump_printf_loc (MSG_NOTE, vect_location,
949 "Vectorizing an unaligned access.");
951 return true;
954 /* Given a memory reference EXP return whether its alignment is less
955 than its size. */
957 static bool
958 not_size_aligned (tree exp)
960 if (!host_integerp (TYPE_SIZE (TREE_TYPE (exp)), 1))
961 return true;
963 return (TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (exp)))
964 > get_object_alignment (exp));
967 /* Function vector_alignment_reachable_p
969 Return true if vector alignment for DR is reachable by peeling
970 a few loop iterations. Return false otherwise. */
972 static bool
973 vector_alignment_reachable_p (struct data_reference *dr)
975 gimple stmt = DR_STMT (dr);
976 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
977 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
979 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
981 /* For interleaved accesses we peel only if the number of iterations in
982 the prolog loop ({VF - misalignment}) is a multiple of the
983 number of interleaved accesses. */
984 int elem_size, mis_in_elements;
985 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
987 /* FORNOW: handle only known alignment. */
988 if (!known_alignment_for_access_p (dr))
989 return false;
991 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
992 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
994 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
995 return false;
998 /* If the misalignment is known at compile time then allow peeling
999 only if natural alignment is reachable through peeling. */
1000 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1002 HOST_WIDE_INT elmsize =
1003 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1004 if (dump_enabled_p ())
1006 dump_printf_loc (MSG_NOTE, vect_location,
1007 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1008 dump_printf (MSG_NOTE,
1009 ". misalignment = %d. ", DR_MISALIGNMENT (dr));
1011 if (DR_MISALIGNMENT (dr) % elmsize)
1013 if (dump_enabled_p ())
1014 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1015 "data size does not divide the misalignment.\n");
1016 return false;
1020 if (!known_alignment_for_access_p (dr))
1022 tree type = TREE_TYPE (DR_REF (dr));
1023 bool is_packed = not_size_aligned (DR_REF (dr));
1024 if (dump_enabled_p ())
1025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1026 "Unknown misalignment, is_packed = %d",is_packed);
1027 if ((TYPE_USER_ALIGN (type) && !is_packed)
1028 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1029 return true;
1030 else
1031 return false;
1034 return true;
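/* Worked example (illustrative only): for 4-byte elements with a known
   misalignment of 6 bytes, DR_MISALIGNMENT % elmsize == 6 % 4 == 2, so no
   whole number of scalar iterations can make the access aligned and the
   function above returns false.  A misalignment of 8 bytes with the same
   element size (8 % 4 == 0) keeps peeling as an option.  */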
1038 /* Calculate the cost of the memory access represented by DR. */
1040 static void
1041 vect_get_data_access_cost (struct data_reference *dr,
1042 unsigned int *inside_cost,
1043 unsigned int *outside_cost,
1044 stmt_vector_for_cost *body_cost_vec)
1046 gimple stmt = DR_STMT (dr);
1047 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1048 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1049 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1050 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1051 int ncopies = vf / nunits;
1053 if (DR_IS_READ (dr))
1054 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1055 NULL, body_cost_vec, false);
1056 else
1057 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1059 if (dump_enabled_p ())
1060 dump_printf_loc (MSG_NOTE, vect_location,
1061 "vect_get_data_access_cost: inside_cost = %d, "
1062 "outside_cost = %d.", *inside_cost, *outside_cost);
1066 /* Insert DR into peeling hash table with NPEEL as key. */
1068 static void
1069 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1070 int npeel)
1072 struct _vect_peel_info elem, *slot;
1073 _vect_peel_info **new_slot;
1074 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1076 elem.npeel = npeel;
1077 slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find (&elem);
1078 if (slot)
1079 slot->count++;
1080 else
1082 slot = XNEW (struct _vect_peel_info);
1083 slot->npeel = npeel;
1084 slot->dr = dr;
1085 slot->count = 1;
1086 new_slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find_slot (slot, INSERT);
1087 *new_slot = slot;
1090 if (!supportable_dr_alignment && !flag_vect_cost_model)
1091 slot->count += VECT_MAX_COST;
1095 /* Traverse peeling hash table to find peeling option that aligns maximum
1096 number of data accesses. */
1098 int
1099 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1100 _vect_peel_extended_info *max)
1102 vect_peel_info elem = *slot;
1104 if (elem->count > max->peel_info.count
1105 || (elem->count == max->peel_info.count
1106 && max->peel_info.npeel > elem->npeel))
1108 max->peel_info.npeel = elem->npeel;
1109 max->peel_info.count = elem->count;
1110 max->peel_info.dr = elem->dr;
1113 return 1;
1117 /* Traverse peeling hash table and calculate cost for each peeling option.
1118 Find the one with the lowest cost. */
1120 int
1121 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1122 _vect_peel_extended_info *min)
1124 vect_peel_info elem = *slot;
1125 int save_misalignment, dummy;
1126 unsigned int inside_cost = 0, outside_cost = 0, i;
1127 gimple stmt = DR_STMT (elem->dr);
1128 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1129 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1130 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1131 struct data_reference *dr;
1132 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1133 int single_iter_cost;
1135 prologue_cost_vec.create (2);
1136 body_cost_vec.create (2);
1137 epilogue_cost_vec.create (2);
1139 FOR_EACH_VEC_ELT (datarefs, i, dr)
1141 stmt = DR_STMT (dr);
1142 stmt_info = vinfo_for_stmt (stmt);
1143 /* For interleaving, only the alignment of the first access
1144 matters. */
1145 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1146 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1147 continue;
1149 save_misalignment = DR_MISALIGNMENT (dr);
1150 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1151 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1152 &body_cost_vec);
1153 SET_DR_MISALIGNMENT (dr, save_misalignment);
1156 single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
1157 outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel,
1158 &dummy, single_iter_cost,
1159 &prologue_cost_vec,
1160 &epilogue_cost_vec);
1162 /* Prologue and epilogue costs are added to the target model later.
1163 These costs depend only on the scalar iteration cost, the
1164 number of peeling iterations finally chosen, and the number of
1165 misaligned statements. So discard the information found here. */
1166 prologue_cost_vec.release ();
1167 epilogue_cost_vec.release ();
1169 if (inside_cost < min->inside_cost
1170 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1172 min->inside_cost = inside_cost;
1173 min->outside_cost = outside_cost;
1174 min->body_cost_vec.release ();
1175 min->body_cost_vec = body_cost_vec;
1176 min->peel_info.dr = elem->dr;
1177 min->peel_info.npeel = elem->npeel;
1179 else
1180 body_cost_vec.release ();
1182 return 1;
1186 /* Choose best peeling option by traversing peeling hash table and either
1187 choosing an option with the lowest cost (if cost model is enabled) or the
1188 option that aligns as many accesses as possible. */
1190 static struct data_reference *
1191 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1192 unsigned int *npeel,
1193 stmt_vector_for_cost *body_cost_vec)
1195 struct _vect_peel_extended_info res;
1197 res.peel_info.dr = NULL;
1198 res.body_cost_vec = stmt_vector_for_cost();
1200 if (flag_vect_cost_model)
1202 res.inside_cost = INT_MAX;
1203 res.outside_cost = INT_MAX;
1204 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1205 .traverse <_vect_peel_extended_info *,
1206 vect_peeling_hash_get_lowest_cost> (&res);
1208 else
1210 res.peel_info.count = 0;
1211 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1212 .traverse <_vect_peel_extended_info *,
1213 vect_peeling_hash_get_most_frequent> (&res);
1216 *npeel = res.peel_info.npeel;
1217 *body_cost_vec = res.body_cost_vec;
1218 return res.peel_info.dr;
1222 /* Function vect_enhance_data_refs_alignment
1224 This pass will use loop versioning and loop peeling in order to enhance
1225 the alignment of data references in the loop.
1227 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1228 original loop is to be vectorized. Any other loops that are created by
1229 the transformations performed in this pass are not supposed to be
1230 vectorized. This restriction will be relaxed.
1232 This pass will require a cost model to guide it whether to apply peeling
1233 or versioning or a combination of the two. For example, the scheme that
1234 Intel uses when given a loop with several memory accesses is as follows:
1235 choose one memory access ('p') whose alignment you want to force by doing
1236 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1237 other accesses are not necessarily aligned, or (2) use loop versioning to
1238 generate one loop in which all accesses are aligned, and another loop in
1239 which only 'p' is necessarily aligned.
1241 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1242 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1243 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1245 Devising a cost model is the most critical aspect of this work. It will
1246 guide us on which access to peel for, whether to use loop versioning, how
1247 many versions to create, etc. The cost model will probably consist of
1248 generic considerations as well as target specific considerations (on
1249 powerpc for example, misaligned stores are more painful than misaligned
1250 loads).
1252 Here are the general steps involved in alignment enhancements:
1254 -- original loop, before alignment analysis:
1255 for (i=0; i<N; i++){
1256 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1257 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1260 -- After vect_compute_data_refs_alignment:
1261 for (i=0; i<N; i++){
1262 x = q[i]; # DR_MISALIGNMENT(q) = 3
1263 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1266 -- Possibility 1: we do loop versioning:
1267 if (p is aligned) {
1268 for (i=0; i<N; i++){ # loop 1A
1269 x = q[i]; # DR_MISALIGNMENT(q) = 3
1270 p[i] = y; # DR_MISALIGNMENT(p) = 0
1273 else {
1274 for (i=0; i<N; i++){ # loop 1B
1275 x = q[i]; # DR_MISALIGNMENT(q) = 3
1276 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1280 -- Possibility 2: we do loop peeling:
1281 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1282 x = q[i];
1283 p[i] = y;
1285 for (i = 3; i < N; i++){ # loop 2A
1286 x = q[i]; # DR_MISALIGNMENT(q) = 0
1287 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1290 -- Possibility 3: combination of loop peeling and versioning:
1291 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1292 x = q[i];
1293 p[i] = y;
1295 if (p is aligned) {
1296 for (i = 3; i<N; i++){ # loop 3A
1297 x = q[i]; # DR_MISALIGNMENT(q) = 0
1298 p[i] = y; # DR_MISALIGNMENT(p) = 0
1301 else {
1302 for (i = 3; i<N; i++){ # loop 3B
1303 x = q[i]; # DR_MISALIGNMENT(q) = 0
1304 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1308 These loops are later passed to loop_transform to be vectorized. The
1309 vectorizer will use the alignment information to guide the transformation
1310 (whether to generate regular loads/stores, or with special handling for
1311 misalignment). */
1313 bool
1314 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1316 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1317 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1318 enum dr_alignment_support supportable_dr_alignment;
1319 struct data_reference *dr0 = NULL, *first_store = NULL;
1320 struct data_reference *dr;
1321 unsigned int i, j;
1322 bool do_peeling = false;
1323 bool do_versioning = false;
1324 bool stat;
1325 gimple stmt;
1326 stmt_vec_info stmt_info;
1327 unsigned int npeel = 0;
1328 bool all_misalignments_unknown = true;
1329 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1330 unsigned possible_npeel_number = 1;
1331 tree vectype;
1332 unsigned int nelements, mis, same_align_drs_max = 0;
1333 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost();
1335 if (dump_enabled_p ())
1336 dump_printf_loc (MSG_NOTE, vect_location,
1337 "=== vect_enhance_data_refs_alignment ===");
1339 /* While cost model enhancements are expected in the future, the high level
1340 view of the code at this time is as follows:
1342 A) If there is a misaligned access then see if peeling to align
1343 this access can make all data references satisfy
1344 vect_supportable_dr_alignment. If so, update data structures
1345 as needed and return true.
1347 B) If peeling wasn't possible and there is a data reference with an
1348 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1349 then see if loop versioning checks can be used to make all data
1350 references satisfy vect_supportable_dr_alignment. If so, update
1351 data structures as needed and return true.
1353 C) If neither peeling nor versioning were successful then return false if
1354 any data reference does not satisfy vect_supportable_dr_alignment.
1356 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1358 Note, Possibility 3 above (which is peeling and versioning together) is not
1359 being done at this time. */
1361 /* (1) Peeling to force alignment. */
1363 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1364 Considerations:
1365 + How many accesses will become aligned due to the peeling
1366 - How many accesses will become unaligned due to the peeling,
1367 and the cost of misaligned accesses.
1368 - The cost of peeling (the extra runtime checks, the increase
1369 in code size). */
1371 FOR_EACH_VEC_ELT (datarefs, i, dr)
1373 stmt = DR_STMT (dr);
1374 stmt_info = vinfo_for_stmt (stmt);
1376 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1377 continue;
1379 /* For interleaving, only the alignment of the first access
1380 matters. */
1381 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1382 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1383 continue;
1385 /* For invariant accesses there is nothing to enhance. */
1386 if (integer_zerop (DR_STEP (dr)))
1387 continue;
1389 /* Strided loads perform only component accesses, alignment is
1390 irrelevant for them. */
1391 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1392 continue;
1394 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1395 do_peeling = vector_alignment_reachable_p (dr);
1396 if (do_peeling)
1398 if (known_alignment_for_access_p (dr))
1400 unsigned int npeel_tmp;
1401 bool negative = tree_int_cst_compare (DR_STEP (dr),
1402 size_zero_node) < 0;
1404 /* Save info about DR in the hash table. */
1405 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
1406 LOOP_VINFO_PEELING_HTAB (loop_vinfo).create (1);
1408 vectype = STMT_VINFO_VECTYPE (stmt_info);
1409 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1410 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1411 TREE_TYPE (DR_REF (dr))));
1412 npeel_tmp = (negative
1413 ? (mis - nelements) : (nelements - mis))
1414 & (nelements - 1);
1416 /* For multiple types, it is possible that the bigger type access
1417 will have more than one peeling option. E.g., a loop with two
1418 types: one of size (vector size / 4), and the other one of
1419 size (vector size / 8). The vectorization factor will be 8. If both
1420 accesses are misaligned by 3, the first one needs one scalar
1421 iteration to be aligned, and the second one needs 5. But the
1422 first one will also be aligned by peeling 5 scalar
1423 iterations, and in that case both accesses will be aligned.
1424 Hence, in addition to the immediate peeling amount, we also want
1425 to try adding a full vector size, as long as we don't exceed the
1426 vectorization factor.
1427 We do this automatically for the cost model, since we calculate the cost
1428 for every peeling option. */
1429 if (!flag_vect_cost_model)
1430 possible_npeel_number = vf /nelements;
1432 /* Handle the aligned case. We may decide to align some other
1433 access, making DR unaligned. */
1434 if (DR_MISALIGNMENT (dr) == 0)
1436 npeel_tmp = 0;
1437 if (!flag_vect_cost_model)
1438 possible_npeel_number++;
1441 for (j = 0; j < possible_npeel_number; j++)
1443 gcc_assert (npeel_tmp <= vf);
1444 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1445 npeel_tmp += nelements;
1448 all_misalignments_unknown = false;
1449 /* Data-ref that was chosen for the case that all the
1450 misalignments are unknown is not relevant anymore, since we
1451 have a data-ref with known alignment. */
1452 dr0 = NULL;
1454 else
1456 /* If we don't know any misalignment values, we prefer
1457 peeling for the data-ref that has the maximum number of data-refs
1458 with the same alignment, unless the target prefers to align
1459 stores over loads. */
1460 if (all_misalignments_unknown)
1462 unsigned same_align_drs
1463 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1464 if (!dr0
1465 || same_align_drs_max < same_align_drs)
1467 same_align_drs_max = same_align_drs;
1468 dr0 = dr;
1470 /* For data-refs with the same number of related
1471 accesses prefer the one where the misalign
1472 computation will be invariant in the outermost loop. */
1473 else if (same_align_drs_max == same_align_drs)
1475 struct loop *ivloop0, *ivloop;
1476 ivloop0 = outermost_invariant_loop_for_expr
1477 (loop, DR_BASE_ADDRESS (dr0));
1478 ivloop = outermost_invariant_loop_for_expr
1479 (loop, DR_BASE_ADDRESS (dr));
1480 if ((ivloop && !ivloop0)
1481 || (ivloop && ivloop0
1482 && flow_loop_nested_p (ivloop, ivloop0)))
1483 dr0 = dr;
1486 if (!first_store && DR_IS_WRITE (dr))
1487 first_store = dr;
1490 /* If there are both known and unknown misaligned accesses in the
1491 loop, we choose peeling amount according to the known
1492 accesses. */
1493 if (!supportable_dr_alignment)
1495 dr0 = dr;
1496 if (!first_store && DR_IS_WRITE (dr))
1497 first_store = dr;
1501 else
1503 if (!aligned_access_p (dr))
1505 if (dump_enabled_p ())
1506 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1507 "vector alignment may not be reachable");
1508 break;
1513 /* Check if we can possibly peel the loop. */
1514 if (!vect_can_advance_ivs_p (loop_vinfo)
1515 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1516 do_peeling = false;
1518 if (do_peeling && all_misalignments_unknown
1519 && vect_supportable_dr_alignment (dr0, false))
1522 /* Check if the target requires to prefer stores over loads, i.e., if
1523 misaligned stores are more expensive than misaligned loads (taking
1524 drs with same alignment into account). */
1525 if (first_store && DR_IS_READ (dr0))
1527 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1528 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1529 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1530 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1531 stmt_vector_for_cost dummy;
1532 dummy.create (2);
1534 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1535 &dummy);
1536 vect_get_data_access_cost (first_store, &store_inside_cost,
1537 &store_outside_cost, &dummy);
1539 dummy.release ();
1541 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1542 aligning the load DR0). */
1543 load_inside_penalty = store_inside_cost;
1544 load_outside_penalty = store_outside_cost;
1545 for (i = 0;
1546 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1547 DR_STMT (first_store))).iterate (i, &dr);
1548 i++)
1549 if (DR_IS_READ (dr))
1551 load_inside_penalty += load_inside_cost;
1552 load_outside_penalty += load_outside_cost;
1554 else
1556 load_inside_penalty += store_inside_cost;
1557 load_outside_penalty += store_outside_cost;
1560 /* Calculate the penalty for leaving DR0 unaligned (by
1561 aligning the FIRST_STORE). */
1562 store_inside_penalty = load_inside_cost;
1563 store_outside_penalty = load_outside_cost;
1564 for (i = 0;
1565 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1566 DR_STMT (dr0))).iterate (i, &dr);
1567 i++)
1568 if (DR_IS_READ (dr))
1570 store_inside_penalty += load_inside_cost;
1571 store_outside_penalty += load_outside_cost;
1573 else
1575 store_inside_penalty += store_inside_cost;
1576 store_outside_penalty += store_outside_cost;
1579 if (load_inside_penalty > store_inside_penalty
1580 || (load_inside_penalty == store_inside_penalty
1581 && load_outside_penalty > store_outside_penalty))
1582 dr0 = first_store;
1585 /* In case there are only loads with different unknown misalignments, use
1586 peeling only if it may help to align other accesses in the loop. */
1587 if (!first_store
1588 && !STMT_VINFO_SAME_ALIGN_REFS (
1589 vinfo_for_stmt (DR_STMT (dr0))).length ()
1590 && vect_supportable_dr_alignment (dr0, false)
1591 != dr_unaligned_supported)
1592 do_peeling = false;
1595 if (do_peeling && !dr0)
1597 /* Peeling is possible, but no data access strictly requires alignment in
1598 order to be supported. So we try to choose the best possible peeling. */
1600 /* We should get here only if there are drs with known misalignment. */
1601 gcc_assert (!all_misalignments_unknown);
1603 /* Choose the best peeling from the hash table. */
1604 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1605 &body_cost_vec);
1606 if (!dr0 || !npeel)
1607 do_peeling = false;
1610 if (do_peeling)
1612 stmt = DR_STMT (dr0);
1613 stmt_info = vinfo_for_stmt (stmt);
1614 vectype = STMT_VINFO_VECTYPE (stmt_info);
1615 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1617 if (known_alignment_for_access_p (dr0))
1619 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1620 size_zero_node) < 0;
1621 if (!npeel)
1623 /* Since it's known at compile time, compute the number of
1624 iterations in the peeled loop (the peeling factor) for use in
1625 updating DR_MISALIGNMENT values. The peeling factor is the
1626 vectorization factor minus the misalignment as an element
1627 count. */
1628 mis = DR_MISALIGNMENT (dr0);
1629 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1630 npeel = ((negative ? mis - nelements : nelements - mis)
1631 & (nelements - 1));
1634 /* For interleaved data access every iteration accesses all the
1635 members of the group, therefore we divide the number of iterations
1636 by the group size. */
1637 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1638 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1639 npeel /= GROUP_SIZE (stmt_info);
1641 if (dump_enabled_p ())
1642 dump_printf_loc (MSG_NOTE, vect_location,
1643 "Try peeling by %d", npeel);
1646 /* Ensure that all data refs can be vectorized after the peel. */
1647 FOR_EACH_VEC_ELT (datarefs, i, dr)
1649 int save_misalignment;
1651 if (dr == dr0)
1652 continue;
1654 stmt = DR_STMT (dr);
1655 stmt_info = vinfo_for_stmt (stmt);
1656 /* For interleaving, only the alignment of the first access
1657 matters. */
1658 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1659 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1660 continue;
1662 /* Strided loads perform only component accesses, alignment is
1663 irrelevant for them. */
1664 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1665 continue;
1667 save_misalignment = DR_MISALIGNMENT (dr);
1668 vect_update_misalignment_for_peel (dr, dr0, npeel);
1669 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1670 SET_DR_MISALIGNMENT (dr, save_misalignment);
1672 if (!supportable_dr_alignment)
1674 do_peeling = false;
1675 break;
1679 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1681 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1682 if (!stat)
1683 do_peeling = false;
1684 else
1686 body_cost_vec.release ();
1687 return stat;
1691 if (do_peeling)
1693 stmt_info_for_cost *si;
1694 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
1696 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1697 If the misalignment of DR_i is identical to that of dr0 then set
1698 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1699 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1700 by the peeling factor times the element size of DR_i (MOD the
1701 vectorization factor times the size). Otherwise, the
1702 misalignment of DR_i must be set to unknown. */
1703 FOR_EACH_VEC_ELT (datarefs, i, dr)
1704 if (dr != dr0)
1705 vect_update_misalignment_for_peel (dr, dr0, npeel);
1707 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1708 if (npeel)
1709 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1710 else
1711 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = DR_MISALIGNMENT (dr0);
1712 SET_DR_MISALIGNMENT (dr0, 0);
1713 if (dump_enabled_p ())
1715 dump_printf_loc (MSG_NOTE, vect_location,
1716 "Alignment of access forced using peeling.");
1717 dump_printf_loc (MSG_NOTE, vect_location,
1718 "Peeling for alignment will be applied.");
1720 /* We've delayed passing the inside-loop peeling costs to the
1721 target cost model until we were sure peeling would happen.
1722 Do so now. */
1723 if (body_cost_vec.exists ())
1725 FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1727 struct _stmt_vec_info *stmt_info
1728 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1729 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1730 si->misalign, vect_body);
1732 body_cost_vec.release ();
1735 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1736 gcc_assert (stat);
1737 return stat;
1741 body_cost_vec.release ();
1743 /* (2) Versioning to force alignment. */
1745 /* Try versioning if:
1746 1) flag_tree_vect_loop_version is TRUE
1747 2) optimize loop for speed
1748 3) there is at least one unsupported misaligned data ref with an unknown
1749 misalignment, and
1750 4) all misaligned data refs with a known misalignment are supported, and
1751 5) the number of runtime alignment checks is within reason. */
1753 do_versioning =
1754 flag_tree_vect_loop_version
1755 && optimize_loop_nest_for_speed_p (loop)
1756 && (!loop->inner); /* FORNOW */
1758 if (do_versioning)
1760 FOR_EACH_VEC_ELT (datarefs, i, dr)
1762 stmt = DR_STMT (dr);
1763 stmt_info = vinfo_for_stmt (stmt);
1765 /* For interleaving, only the alignment of the first access
1766 matters. */
1767 if (aligned_access_p (dr)
1768 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1769 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1770 continue;
1772 /* Strided loads perform only component accesses, alignment is
1773 irrelevant for them. */
1774 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1775 continue;
1777 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1779 if (!supportable_dr_alignment)
1781 gimple stmt;
1782 int mask;
1783 tree vectype;
1785 if (known_alignment_for_access_p (dr)
1786 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1787 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1789 do_versioning = false;
1790 break;
1793 stmt = DR_STMT (dr);
1794 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1795 gcc_assert (vectype);
1797 /* The rightmost bits of an aligned address must be zeros.
1798 Construct the mask needed for this test. For example,
1799 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1800 mask must be 15 = 0xf. */
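 /* For illustration (hypothetical example, assuming a 16-byte V4SI
    vector type):

      #include <stdint.h>
      static int is_aligned (const void *p)
      {
        uintptr_t mask = 16 - 1;                 // GET_MODE_SIZE - 1 = 0xf
        return ((uintptr_t) p & mask) == 0;      // low bits must be zero
      }

    The versioned loop performs this test at run time on each potentially
    misaligned pointer and falls back to the scalar loop on failure.  */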
1801 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1803 /* FORNOW: use the same mask to test all potentially unaligned
1804 references in the loop. The vectorizer currently supports
1805 a single vector size, see the reference to
1806 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1807 vectorization factor is computed. */
1808 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1809 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1810 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1811 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1812 DR_STMT (dr));
1816 /* Versioning requires at least one misaligned data reference. */
1817 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1818 do_versioning = false;
1819 else if (!do_versioning)
1820 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1823 if (do_versioning)
1825 vec<gimple> may_misalign_stmts
1826 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1827 gimple stmt;
1829 /* It can now be assumed that the data references in the statements
1830 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1831 of the loop being vectorized. */
1832 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1834 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1835 dr = STMT_VINFO_DATA_REF (stmt_info);
1836 SET_DR_MISALIGNMENT (dr, 0);
1837 if (dump_enabled_p ())
1838 dump_printf_loc (MSG_NOTE, vect_location,
1839 "Alignment of access forced using versioning.");
1842 if (dump_enabled_p ())
1843 dump_printf_loc (MSG_NOTE, vect_location,
1844 "Versioning for alignment will be applied.");
1846 /* Peeling and versioning can't be done together at this time. */
1847 gcc_assert (! (do_peeling && do_versioning));
1849 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1850 gcc_assert (stat);
1851 return stat;
1854 /* This point is reached if neither peeling nor versioning is being done. */
1855 gcc_assert (! (do_peeling || do_versioning));
1857 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1858 return stat;
1862 /* Function vect_find_same_alignment_drs.
1864 Update group and alignment relations according to the chosen
1865 vectorization factor. */
1867 static void
1868 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1869 loop_vec_info loop_vinfo)
1871 unsigned int i;
1872 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1873 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1874 struct data_reference *dra = DDR_A (ddr);
1875 struct data_reference *drb = DDR_B (ddr);
1876 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1877 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1878 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1879 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1880 lambda_vector dist_v;
1881 unsigned int loop_depth;
1883 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1884 return;
1886 if (dra == drb)
1887 return;
1889 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1890 return;
1892 /* Loop-based vectorization and known data dependence. */
1893 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1894 return;
1896 /* Data-dependence analysis reports a distance vector of zero
1897 for data-references that overlap only in the first iteration
1898 but whose steps have different signs (see PR45764).
1899 So as a sanity check require equal DR_STEP. */
1900 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1901 return;
1903 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1904 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1906 int dist = dist_v[loop_depth];
1908 if (dump_enabled_p ())
1909 dump_printf_loc (MSG_NOTE, vect_location,
1910 "dependence distance = %d.", dist);
1912 /* Same loop iteration. */
1913 if (dist == 0
1914 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1916 /* Two references with distance zero have the same alignment. */
1917 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1918 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1919 if (dump_enabled_p ())
1921 dump_printf_loc (MSG_NOTE, vect_location,
1922 "accesses have the same alignment.");
1923 dump_printf (MSG_NOTE,
1924 "dependence distance modulo vf == 0 between ");
1925 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1926 dump_printf (MSG_NOTE, " and ");
1927 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1934 /* Function vect_analyze_data_refs_alignment
1936 Analyze the alignment of the data-references in the loop.
1937 Return FALSE if a data reference is found that cannot be vectorized. */
1939 bool
1940 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1941 bb_vec_info bb_vinfo)
1943 if (dump_enabled_p ())
1944 dump_printf_loc (MSG_NOTE, vect_location,
1945 "=== vect_analyze_data_refs_alignment ===");
1947 /* Mark groups of data references with the same alignment using
1948 data dependence information. */
1949 if (loop_vinfo)
1951 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
1952 struct data_dependence_relation *ddr;
1953 unsigned int i;
1955 FOR_EACH_VEC_ELT (ddrs, i, ddr)
1956 vect_find_same_alignment_drs (ddr, loop_vinfo);
1959 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
1961 if (dump_enabled_p ())
1962 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1963 "not vectorized: can't calculate alignment "
1964 "for data ref.");
1965 return false;
1968 return true;
1972 /* Analyze groups of accesses: check that DR belongs to a group of
1973 accesses of legal size, step, etc. Detect gaps, single element
1974 interleaving, and other special cases. Set grouped access info.
1975 Collect groups of strided stores for further use in SLP analysis. */
1977 static bool
1978 vect_analyze_group_access (struct data_reference *dr)
1980 tree step = DR_STEP (dr);
1981 tree scalar_type = TREE_TYPE (DR_REF (dr));
1982 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
1983 gimple stmt = DR_STMT (dr);
1984 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1985 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1986 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
1987 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
1988 HOST_WIDE_INT groupsize, last_accessed_element = 1;
1989 bool slp_impossible = false;
1990 struct loop *loop = NULL;
1992 if (loop_vinfo)
1993 loop = LOOP_VINFO_LOOP (loop_vinfo);
1995 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
1996 size of the interleaving group (including gaps). */
1997 groupsize = absu_hwi (dr_step) / type_size;
1999 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2000 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2002 /* Check whether this DR is part of an interleaving group, and is a single
2003 element of the group that is accessed in the loop. */
2005 /* Gaps are supported only for loads. STEP must be a multiple of the type
2006 size. The size of the group must be a power of 2. */
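 /* For illustration (hypothetical loop): a load such as

      for (i = 0; i < n; i++)
        sum += a[4 * i];

    has step 4 * sizeof (a[0]) and is treated as a group of size 4 in which
    only one element is actually accessed; the remaining three slots are
    gaps, which is why a scalar epilogue loop may be required.  */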
2007 if (DR_IS_READ (dr)
2008 && (dr_step % type_size) == 0
2009 && groupsize > 0
2010 && exact_log2 (groupsize) != -1)
2012 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2013 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2014 if (dump_enabled_p ())
2016 dump_printf_loc (MSG_NOTE, vect_location,
2017 "Detected single element interleaving ");
2018 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2019 dump_printf (MSG_NOTE, " step ");
2020 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2023 if (loop_vinfo)
2025 if (dump_enabled_p ())
2026 dump_printf_loc (MSG_NOTE, vect_location,
2027 "Data access with gaps requires scalar "
2028 "epilogue loop");
2029 if (loop->inner)
2031 if (dump_enabled_p ())
2032 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2033 "Peeling for outer loop is not"
2034 " supported");
2035 return false;
2038 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2041 return true;
2044 if (dump_enabled_p ())
2046 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2047 "not consecutive access ");
2048 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2051 if (bb_vinfo)
2053 /* Mark the statement as unvectorizable. */
2054 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2055 return true;
2058 return false;
2061 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2063 /* First stmt in the interleaving chain. Check the chain. */
2064 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2065 struct data_reference *data_ref = dr;
2066 unsigned int count = 1;
2067 tree prev_init = DR_INIT (data_ref);
2068 gimple prev = stmt;
2069 HOST_WIDE_INT diff, gaps = 0;
2070 unsigned HOST_WIDE_INT count_in_bytes;
2072 while (next)
2074 /* Skip same data-refs. If two or more stmts share a
2075 data-ref (supported only for loads), we vectorize only the first
2076 stmt, and the rest get their vectorized loads from the first
2077 one. */
2078 if (!tree_int_cst_compare (DR_INIT (data_ref),
2079 DR_INIT (STMT_VINFO_DATA_REF (
2080 vinfo_for_stmt (next)))))
2082 if (DR_IS_WRITE (data_ref))
2084 if (dump_enabled_p ())
2085 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2086 "Two store stmts share the same dr.");
2087 return false;
2090 /* For loads, reuse the same data-ref load. */
2091 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2093 prev = next;
2094 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2095 continue;
2098 prev = next;
2099 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2101 /* All group members have the same STEP by construction. */
2102 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2104 /* Check that the distance between two accesses is equal to the type
2105 size. Otherwise, we have gaps. */
2106 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2107 - TREE_INT_CST_LOW (prev_init)) / type_size;
2108 if (diff != 1)
2110 /* FORNOW: SLP of accesses with gaps is not supported. */
2111 slp_impossible = true;
2112 if (DR_IS_WRITE (data_ref))
2114 if (dump_enabled_p ())
2115 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2116 "interleaved store with gaps");
2117 return false;
2120 gaps += diff - 1;
2123 last_accessed_element += diff;
2125 /* Store the gap from the previous member of the group. If there is no
2126 gap in the access, GROUP_GAP is always 1. */
2127 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2129 prev_init = DR_INIT (data_ref);
2130 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2131 /* Count the number of data-refs in the chain. */
2132 count++;
2135 /* COUNT is the number of accesses found, we multiply it by the size of
2136 the type to get COUNT_IN_BYTES. */
2137 count_in_bytes = type_size * count;
2139 /* Check that the size of the interleaving (including gaps) is not
2140 greater than STEP. */
2141 if (dr_step != 0
2142 && absu_hwi (dr_step) < count_in_bytes + gaps * type_size)
2144 if (dump_enabled_p ())
2146 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2147 "interleaving size is greater than step for ");
2148 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dr));
2150 return false;
2153 /* Check that the size of the interleaving is equal to STEP for stores,
2154 i.e., that there are no gaps. */
2155 if (dr_step != 0
2156 && absu_hwi (dr_step) != count_in_bytes)
2158 if (DR_IS_READ (dr))
2160 slp_impossible = true;
2161 /* There is a gap after the last load in the group. This gap is a
2162 difference between the groupsize and the number of elements.
2163 When there is no gap, this difference should be 0. */
2164 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
2166 else
2168 if (dump_enabled_p ())
2169 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2170 "interleaved store with gaps");
2171 return false;
2175 /* Check that STEP is a multiple of type size. */
2176 if (dr_step != 0
2177 && (dr_step % type_size) != 0)
2179 if (dump_enabled_p ())
2181 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2182 "step is not a multiple of type size: step ");
2183 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2184 dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2185 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2186 TYPE_SIZE_UNIT (scalar_type));
2188 return false;
2191 if (groupsize == 0)
2192 groupsize = count;
2194 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2195 if (dump_enabled_p ())
2196 dump_printf_loc (MSG_NOTE, vect_location,
2197 "Detected interleaving of size %d", (int)groupsize);
2199 /* SLP: create an SLP data structure for every interleaving group of
2200 stores for further analysis in vect_analyze_slp. */
2201 if (DR_IS_WRITE (dr) && !slp_impossible)
2203 if (loop_vinfo)
2204 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2205 if (bb_vinfo)
2206 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2209 /* There is a gap in the end of the group. */
2210 if (groupsize - last_accessed_element > 0 && loop_vinfo)
2212 if (dump_enabled_p ())
2213 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2214 "Data access with gaps requires scalar "
2215 "epilogue loop");
2216 if (loop->inner)
2218 if (dump_enabled_p ())
2219 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2220 "Peeling for outer loop is not supported");
2221 return false;
2224 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2228 return true;
2232 /* Analyze the access pattern of the data-reference DR.
2233 In case of non-consecutive accesses call vect_analyze_group_access() to
2234 analyze groups of accesses. */
2236 static bool
2237 vect_analyze_data_ref_access (struct data_reference *dr)
2239 tree step = DR_STEP (dr);
2240 tree scalar_type = TREE_TYPE (DR_REF (dr));
2241 gimple stmt = DR_STMT (dr);
2242 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2243 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2244 struct loop *loop = NULL;
2246 if (loop_vinfo)
2247 loop = LOOP_VINFO_LOOP (loop_vinfo);
2249 if (loop_vinfo && !step)
2251 if (dump_enabled_p ())
2252 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2253 "bad data-ref access in loop");
2254 return false;
2257 /* Allow invariant loads in loops. */
2258 if (loop_vinfo && integer_zerop (step))
2260 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2261 return DR_IS_READ (dr);
2264 if (loop && nested_in_vect_loop_p (loop, stmt))
2266 /* Interleaved accesses are not yet supported within outer-loop
2267 vectorization for references in the inner-loop. */
2268 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2270 /* For the rest of the analysis we use the outer-loop step. */
2271 step = STMT_VINFO_DR_STEP (stmt_info);
2272 if (integer_zerop (step))
2274 if (dump_enabled_p ())
2275 dump_printf_loc (MSG_NOTE, vect_location,
2276 "zero step in outer loop.");
2277 if (DR_IS_READ (dr))
2278 return true;
2279 else
2280 return false;
2284 /* Consecutive? */
2285 if (TREE_CODE (step) == INTEGER_CST)
2287 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2288 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2289 || (dr_step < 0
2290 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2292 /* Mark that it is not interleaving. */
2293 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2294 return true;
2298 if (loop && nested_in_vect_loop_p (loop, stmt))
2300 if (dump_enabled_p ())
2301 dump_printf_loc (MSG_NOTE, vect_location,
2302 "grouped access in outer loop.");
2303 return false;
2306 /* Assume this is a DR handled by non-constant strided load case. */
2307 if (TREE_CODE (step) != INTEGER_CST)
2308 return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2310 /* Non-consecutive access - check whether it's part of an interleaving group. */
2311 return vect_analyze_group_access (dr);
2316 /* A helper function used in the comparator function to sort data
2317 references. T1 and T2 are the two trees to be compared.
2318 The function returns -1, 0, or 1. */
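 /* For illustration (hypothetical standalone example): a three-way
    comparator of this shape is what qsort expects, e.g.

      #include <stdlib.h>
      static int cmp_int (const void *a_, const void *b_)
      {
        int a = *(const int *) a_, b = *(const int *) b_;
        return a < b ? -1 : a > b ? 1 : 0;
      }
      // qsort (v, n, sizeof (int), cmp_int);

    compare_tree below plays the same role for trees, so that data
    references can be sorted deterministically.  */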
2320 static int
2321 compare_tree (tree t1, tree t2)
2323 int i, cmp;
2324 enum tree_code code;
2325 char tclass;
2327 if (t1 == t2)
2328 return 0;
2329 if (t1 == NULL)
2330 return -1;
2331 if (t2 == NULL)
2332 return 1;
2335 if (TREE_CODE (t1) != TREE_CODE (t2))
2336 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2338 code = TREE_CODE (t1);
2339 switch (code)
2341 /* For const values, we can just use hash values for comparisons. */
2342 case INTEGER_CST:
2343 case REAL_CST:
2344 case FIXED_CST:
2345 case STRING_CST:
2346 case COMPLEX_CST:
2347 case VECTOR_CST:
2349 hashval_t h1 = iterative_hash_expr (t1, 0);
2350 hashval_t h2 = iterative_hash_expr (t2, 0);
2351 if (h1 != h2)
2352 return h1 < h2 ? -1 : 1;
2353 break;
2356 case SSA_NAME:
2357 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2358 if (cmp != 0)
2359 return cmp;
2361 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2362 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2363 break;
2365 default:
2366 tclass = TREE_CODE_CLASS (code);
2368 /* For var-decl, we could compare their UIDs. */
2369 if (tclass == tcc_declaration)
2371 if (DECL_UID (t1) != DECL_UID (t2))
2372 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2373 break;
2376 /* For expressions with operands, compare their operands recursively. */
2377 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2379 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2380 if (cmp != 0)
2381 return cmp;
2385 return 0;
2389 /* Compare two data-references DRA and DRB so that they can be sorted into
2390 chunks suitable for grouping. */
2392 static int
2393 dr_group_sort_cmp (const void *dra_, const void *drb_)
2395 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2396 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2397 int cmp;
2399 /* Stabilize sort. */
2400 if (dra == drb)
2401 return 0;
2403 /* Ordering of DRs according to base. */
2404 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2406 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2407 if (cmp != 0)
2408 return cmp;
2411 /* And according to DR_OFFSET. */
2412 if (!dr_equal_offsets_p (dra, drb))
2414 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2415 if (cmp != 0)
2416 return cmp;
2419 /* Put reads before writes. */
2420 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2421 return DR_IS_READ (dra) ? -1 : 1;
2423 /* Then sort by access size. */
2424 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2425 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2427 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2428 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2429 if (cmp != 0)
2430 return cmp;
2433 /* And by step. */
2434 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2436 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2437 if (cmp != 0)
2438 return cmp;
2441 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2442 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2443 if (cmp == 0)
2444 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2445 return cmp;
2448 /* Function vect_analyze_data_ref_accesses.
2450 Analyze the access pattern of all the data references in the loop.
2452 FORNOW: the only access pattern that is considered vectorizable is a
2453 simple step 1 (consecutive) access.
2455 FORNOW: handle only arrays and pointer accesses. */
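 /* For illustration (hypothetical loops): a consecutive (step 1) access and
    a grouped/interleaved access look like

      for (i = 0; i < n; i++)
        a[i] = b[i] + 1;                  // step 1, consecutive

      for (i = 0; i < n; i++)
        {
          c[2 * i]     = x[i];            // interleaved store group,
          c[2 * i + 1] = y[i];            // size 2, no gaps
        }

    The sorting below brings such related data-refs next to each other so
    the interleaving chains can be built in a single pass.  */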
2457 bool
2458 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2460 unsigned int i;
2461 vec<data_reference_p> datarefs;
2462 struct data_reference *dr;
2464 if (dump_enabled_p ())
2465 dump_printf_loc (MSG_NOTE, vect_location,
2466 "=== vect_analyze_data_ref_accesses ===");
2468 if (loop_vinfo)
2469 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2470 else
2471 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2473 if (datarefs.is_empty ())
2474 return true;
2476 /* Sort the array of datarefs to make building the interleaving chains
2477 linear. */
2478 qsort (datarefs.address(), datarefs.length (),
2479 sizeof (data_reference_p), dr_group_sort_cmp);
2481 /* Build the interleaving chains. */
2482 for (i = 0; i < datarefs.length () - 1;)
2484 data_reference_p dra = datarefs[i];
2485 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2486 stmt_vec_info lastinfo = NULL;
2487 for (i = i + 1; i < datarefs.length (); ++i)
2489 data_reference_p drb = datarefs[i];
2490 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2492 /* ??? Imperfect sorting (non-compatible types, non-modulo
2493 accesses, same accesses) can cause a group to be artificially
2494 split here as we don't just skip over those. If it really
2495 matters we can push those to a worklist and re-iterate
2496 over them. Then we can just skip ahead to the next DR here. */
2498 /* Check that the data-refs have the same first location (except init)
2499 and that they are both loads or both stores (not a load and a store). */
2500 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2501 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2502 DR_BASE_ADDRESS (drb), 0)
2503 || !dr_equal_offsets_p (dra, drb))
2504 break;
2506 /* Check that the data-refs have the same constant size and step. */
2507 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2508 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2509 if (!host_integerp (sza, 1)
2510 || !host_integerp (szb, 1)
2511 || !tree_int_cst_equal (sza, szb)
2512 || !host_integerp (DR_STEP (dra), 0)
2513 || !host_integerp (DR_STEP (drb), 0)
2514 || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2515 break;
2517 /* Do not place the same access in the interleaving chain twice. */
2518 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2519 break;
2521 /* Check the types are compatible.
2522 ??? We don't distinguish this during sorting. */
2523 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2524 TREE_TYPE (DR_REF (drb))))
2525 break;
2527 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2528 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2529 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2530 gcc_assert (init_a < init_b);
2532 /* If init_b == init_a + the size of the type * k, we have an
2533 interleaving, and DRA is accessed before DRB. */
2534 HOST_WIDE_INT type_size_a = TREE_INT_CST_LOW (sza);
2535 if ((init_b - init_a) % type_size_a != 0)
2536 break;
2538 /* The step (if not zero) must be greater than the difference between
2539 the data-refs' inits; requiring this splits groups into suitable sizes. */
2540 HOST_WIDE_INT step = TREE_INT_CST_LOW (DR_STEP (dra));
2541 if (step != 0 && step <= (init_b - init_a))
2542 break;
2544 if (dump_enabled_p ())
2546 dump_printf_loc (MSG_NOTE, vect_location,
2547 "Detected interleaving ");
2548 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2549 dump_printf (MSG_NOTE, " and ");
2550 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2553 /* Link the found element into the group list. */
2554 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2556 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2557 lastinfo = stmtinfo_a;
2559 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2560 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2561 lastinfo = stmtinfo_b;
2565 FOR_EACH_VEC_ELT (datarefs, i, dr)
2566 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2567 && !vect_analyze_data_ref_access (dr))
2569 if (dump_enabled_p ())
2570 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2571 "not vectorized: complicated access pattern.");
2573 if (bb_vinfo)
2575 /* Mark the statement as not vectorizable. */
2576 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2577 continue;
2579 else
2580 return false;
2583 return true;
2586 /* Function vect_prune_runtime_alias_test_list.
2588 Prune a list of ddrs to be tested at run-time by versioning for alias.
2589 Return FALSE if the resulting list of ddrs is longer than allowed by
2590 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
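 /* For illustration (hypothetical example): versioning for alias guards the
    vectorized loop with pairwise overlap tests of the form

      void add (int *a, int *b, long n)
      {
        if (a + n <= b || b + n <= a)     // ranges do not overlap
          for (long i = 0; i < n; i++)    // vectorized version
            a[i] += b[i];
        else
          for (long i = 0; i < n; i++)    // scalar fallback
            a[i] += b[i];
      }

    Each surviving ddr contributes one such check, which is why the list is
    capped by PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS.  */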
2592 bool
2593 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2595 vec<ddr_p> ddrs =
2596 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2597 unsigned i, j;
2599 if (dump_enabled_p ())
2600 dump_printf_loc (MSG_NOTE, vect_location,
2601 "=== vect_prune_runtime_alias_test_list ===");
2603 for (i = 0; i < ddrs.length (); )
2605 bool found;
2606 ddr_p ddr_i;
2608 ddr_i = ddrs[i];
2609 found = false;
2611 for (j = 0; j < i; j++)
2613 ddr_p ddr_j = ddrs[j];
2615 if (vect_vfa_range_equal (ddr_i, ddr_j))
2617 if (dump_enabled_p ())
2619 dump_printf_loc (MSG_NOTE, vect_location,
2620 "found equal ranges ");
2621 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr_i)));
2622 dump_printf (MSG_NOTE, ", ");
2623 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr_i)));
2624 dump_printf (MSG_NOTE, " and ");
2625 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr_j)));
2626 dump_printf (MSG_NOTE, ", ");
2627 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr_j)));
2629 found = true;
2630 break;
2634 if (found)
2636 ddrs.ordered_remove (i);
2637 continue;
2639 i++;
2642 if (ddrs.length () >
2643 (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2645 if (dump_enabled_p ())
2647 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2648 "disable versioning for alias - max number of "
2649 "generated checks exceeded.");
2652 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0);
2654 return false;
2657 return true;
2660 /* Check whether a non-affine read in stmt is suitable for gather load
2661 and if so, return a builtin decl for that operation. */
2663 tree
2664 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2665 tree *offp, int *scalep)
2667 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2668 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2669 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2670 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2671 tree offtype = NULL_TREE;
2672 tree decl, base, off;
2673 enum machine_mode pmode;
2674 int punsignedp, pvolatilep;
2676 /* The gather builtins need an address of the form
2677 loop_invariant + vector * {1, 2, 4, 8}
2679 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2680 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
2681 of loop invariants/SSA_NAMEs defined in the loop, with casts,
2682 multiplications and additions in it. To get a vector, we need
2683 a single SSA_NAME that will be defined in the loop and will
2684 contain everything that is not loop invariant and that can be
2685 vectorized. The following code attempts to find such a preexisting
2686 SSA_NAME OFF and put the loop invariants into a tree BASE
2687 that can be gimplified before the loop. */
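 /* For illustration (hypothetical loop): a typical gather candidate is

      for (i = 0; i < n; i++)
        out[i] = base[idx[i]];

    where each loaded address is  base + (long) idx[i] * sizeof (*base),
    i.e. loop_invariant + sign_extend (vector) * scale with scale 4 or 8.
    BASE collects the invariant part and OFF the per-iteration index.  */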
2688 base = get_inner_reference (DR_REF (dr), &pbitsize, &pbitpos, &off,
2689 &pmode, &punsignedp, &pvolatilep, false);
2690 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
2692 if (TREE_CODE (base) == MEM_REF)
2694 if (!integer_zerop (TREE_OPERAND (base, 1)))
2696 if (off == NULL_TREE)
2698 double_int moff = mem_ref_offset (base);
2699 off = double_int_to_tree (sizetype, moff);
2701 else
2702 off = size_binop (PLUS_EXPR, off,
2703 fold_convert (sizetype, TREE_OPERAND (base, 1)));
2705 base = TREE_OPERAND (base, 0);
2707 else
2708 base = build_fold_addr_expr (base);
2710 if (off == NULL_TREE)
2711 off = size_zero_node;
2713 /* If BASE is not loop invariant, then if OFF is zero we start with just
2714 the constant offset in the loop-invariant BASE and continue with BASE
2715 as OFF; otherwise give up.
2716 We could handle that case by gimplifying the addition of base + off
2717 into some SSA_NAME and use that as off, but for now punt. */
2718 if (!expr_invariant_in_loop_p (loop, base))
2720 if (!integer_zerop (off))
2721 return NULL_TREE;
2722 off = base;
2723 base = size_int (pbitpos / BITS_PER_UNIT);
2725 /* Otherwise put base + constant offset into the loop invariant BASE
2726 and continue with OFF. */
2727 else
2729 base = fold_convert (sizetype, base);
2730 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
2733 /* OFF at this point may be either a SSA_NAME or some tree expression
2734 from get_inner_reference. Try to peel off loop invariants from it
2735 into BASE as long as possible. */
2736 STRIP_NOPS (off);
2737 while (offtype == NULL_TREE)
2739 enum tree_code code;
2740 tree op0, op1, add = NULL_TREE;
2742 if (TREE_CODE (off) == SSA_NAME)
2744 gimple def_stmt = SSA_NAME_DEF_STMT (off);
2746 if (expr_invariant_in_loop_p (loop, off))
2747 return NULL_TREE;
2749 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
2750 break;
2752 op0 = gimple_assign_rhs1 (def_stmt);
2753 code = gimple_assign_rhs_code (def_stmt);
2754 op1 = gimple_assign_rhs2 (def_stmt);
2756 else
2758 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
2759 return NULL_TREE;
2760 code = TREE_CODE (off);
2761 extract_ops_from_tree (off, &code, &op0, &op1);
2763 switch (code)
2765 case POINTER_PLUS_EXPR:
2766 case PLUS_EXPR:
2767 if (expr_invariant_in_loop_p (loop, op0))
2769 add = op0;
2770 off = op1;
2771 do_add:
2772 add = fold_convert (sizetype, add);
2773 if (scale != 1)
2774 add = size_binop (MULT_EXPR, add, size_int (scale));
2775 base = size_binop (PLUS_EXPR, base, add);
2776 continue;
2778 if (expr_invariant_in_loop_p (loop, op1))
2780 add = op1;
2781 off = op0;
2782 goto do_add;
2784 break;
2785 case MINUS_EXPR:
2786 if (expr_invariant_in_loop_p (loop, op1))
2788 add = fold_convert (sizetype, op1);
2789 add = size_binop (MINUS_EXPR, size_zero_node, add);
2790 off = op0;
2791 goto do_add;
2793 break;
2794 case MULT_EXPR:
2795 if (scale == 1 && host_integerp (op1, 0))
2797 scale = tree_low_cst (op1, 0);
2798 off = op0;
2799 continue;
2801 break;
2802 case SSA_NAME:
2803 off = op0;
2804 continue;
2805 CASE_CONVERT:
2806 if (!POINTER_TYPE_P (TREE_TYPE (op0))
2807 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
2808 break;
2809 if (TYPE_PRECISION (TREE_TYPE (op0))
2810 == TYPE_PRECISION (TREE_TYPE (off)))
2812 off = op0;
2813 continue;
2815 if (TYPE_PRECISION (TREE_TYPE (op0))
2816 < TYPE_PRECISION (TREE_TYPE (off)))
2818 off = op0;
2819 offtype = TREE_TYPE (off);
2820 STRIP_NOPS (off);
2821 continue;
2823 break;
2824 default:
2825 break;
2827 break;
2830 /* If at the end OFF still isn't a SSA_NAME or isn't
2831 defined in the loop, punt. */
2832 if (TREE_CODE (off) != SSA_NAME
2833 || expr_invariant_in_loop_p (loop, off))
2834 return NULL_TREE;
2836 if (offtype == NULL_TREE)
2837 offtype = TREE_TYPE (off);
2839 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
2840 offtype, scale);
2841 if (decl == NULL_TREE)
2842 return NULL_TREE;
2844 if (basep)
2845 *basep = base;
2846 if (offp)
2847 *offp = off;
2848 if (scalep)
2849 *scalep = scale;
2850 return decl;
2853 /* Function vect_analyze_data_refs.
2855 Find all the data references in the loop or basic block.
2857 The general structure of the analysis of data refs in the vectorizer is as
2858 follows:
2859 1- vect_analyze_data_refs(loop/bb): call
2860 compute_data_dependences_for_loop/bb to find and analyze all data-refs
2861 in the loop/bb and their dependences.
2862 2- vect_analyze_dependences(): apply dependence testing using ddrs.
2863 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
2864 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
2868 bool
2869 vect_analyze_data_refs (loop_vec_info loop_vinfo,
2870 bb_vec_info bb_vinfo,
2871 int *min_vf)
2873 struct loop *loop = NULL;
2874 basic_block bb = NULL;
2875 unsigned int i;
2876 vec<data_reference_p> datarefs;
2877 struct data_reference *dr;
2878 tree scalar_type;
2880 if (dump_enabled_p ())
2881 dump_printf_loc (MSG_NOTE, vect_location,
2882 "=== vect_analyze_data_refs ===\n");
2884 if (loop_vinfo)
2886 loop = LOOP_VINFO_LOOP (loop_vinfo);
2887 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo))
2888 || find_data_references_in_loop
2889 (loop, &LOOP_VINFO_DATAREFS (loop_vinfo)))
2891 if (dump_enabled_p ())
2892 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2893 "not vectorized: loop contains function calls"
2894 " or data references that cannot be analyzed");
2895 return false;
2898 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2900 else
2902 gimple_stmt_iterator gsi;
2904 bb = BB_VINFO_BB (bb_vinfo);
2905 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2907 gimple stmt = gsi_stmt (gsi);
2908 if (!find_data_references_in_stmt (NULL, stmt,
2909 &BB_VINFO_DATAREFS (bb_vinfo)))
2911 /* Mark the rest of the basic-block as unvectorizable. */
2912 for (; !gsi_end_p (gsi); gsi_next (&gsi))
2914 stmt = gsi_stmt (gsi);
2915 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
2917 break;
2921 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2924 /* Go through the data-refs, check that the analysis succeeded. Update
2925 pointer from stmt_vec_info struct to DR and vectype. */
2927 FOR_EACH_VEC_ELT (datarefs, i, dr)
2929 gimple stmt;
2930 stmt_vec_info stmt_info;
2931 tree base, offset, init;
2932 bool gather = false;
2933 int vf;
2935 again:
2936 if (!dr || !DR_REF (dr))
2938 if (dump_enabled_p ())
2939 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2940 "not vectorized: unhandled data-ref ");
2941 return false;
2944 stmt = DR_STMT (dr);
2945 stmt_info = vinfo_for_stmt (stmt);
2947 /* Discard clobbers from the dataref vector. We will remove
2948 clobber stmts during vectorization. */
2949 if (gimple_clobber_p (stmt))
2951 if (i == datarefs.length () - 1)
2953 datarefs.pop ();
2954 break;
2956 datarefs[i] = datarefs.pop ();
2957 goto again;
2960 /* Check that analysis of the data-ref succeeded. */
2961 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
2962 || !DR_STEP (dr))
2964 /* If the target supports vector gather loads, see whether one can
2965 be used. */
2966 if (loop_vinfo
2967 && DR_IS_READ (dr)
2968 && !TREE_THIS_VOLATILE (DR_REF (dr))
2969 && targetm.vectorize.builtin_gather != NULL
2970 && !nested_in_vect_loop_p (loop, stmt))
2972 struct data_reference *newdr
2973 = create_data_ref (NULL, loop_containing_stmt (stmt),
2974 DR_REF (dr), stmt, true);
2975 gcc_assert (newdr != NULL && DR_REF (newdr));
2976 if (DR_BASE_ADDRESS (newdr)
2977 && DR_OFFSET (newdr)
2978 && DR_INIT (newdr)
2979 && DR_STEP (newdr)
2980 && integer_zerop (DR_STEP (newdr)))
2982 dr = newdr;
2983 gather = true;
2985 else
2986 free_data_ref (newdr);
2989 if (!gather)
2991 if (dump_enabled_p ())
2993 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2994 "not vectorized: data ref analysis "
2995 "failed ");
2996 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2999 if (bb_vinfo)
3000 break;
3002 return false;
3006 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3008 if (dump_enabled_p ())
3009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3010 "not vectorized: base addr of dr is a "
3011 "constant");
3013 if (bb_vinfo)
3014 break;
3016 if (gather)
3017 free_data_ref (dr);
3018 return false;
3021 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3023 if (dump_enabled_p ())
3025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3026 "not vectorized: volatile type ");
3027 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3030 if (bb_vinfo)
3031 break;
3033 return false;
3036 if (stmt_can_throw_internal (stmt))
3038 if (dump_enabled_p ())
3040 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3041 "not vectorized: statement can throw an "
3042 "exception ");
3043 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3046 if (bb_vinfo)
3047 break;
3049 if (gather)
3050 free_data_ref (dr);
3051 return false;
3054 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3055 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3057 if (dump_enabled_p ())
3059 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3060 "not vectorized: statement is bitfield "
3061 "access ");
3062 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3065 if (bb_vinfo)
3066 break;
3068 if (gather)
3069 free_data_ref (dr);
3070 return false;
3073 base = unshare_expr (DR_BASE_ADDRESS (dr));
3074 offset = unshare_expr (DR_OFFSET (dr));
3075 init = unshare_expr (DR_INIT (dr));
3077 if (is_gimple_call (stmt))
3079 if (dump_enabled_p ())
3081 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3082 "not vectorized: dr in a call ");
3083 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3086 if (bb_vinfo)
3087 break;
3089 if (gather)
3090 free_data_ref (dr);
3091 return false;
3094 /* Update DR field in stmt_vec_info struct. */
3096 /* If the dataref is in an inner-loop of the loop that is considered
3097 for vectorization, we also want to analyze the access relative to
3098 the outer-loop (DR contains information only relative to the
3099 inner-most enclosing loop). We do that by building a reference to the
3100 first location accessed by the inner-loop, and analyze it relative to
3101 the outer-loop. */
3102 if (loop && nested_in_vect_loop_p (loop, stmt))
3104 tree outer_step, outer_base, outer_init;
3105 HOST_WIDE_INT pbitsize, pbitpos;
3106 tree poffset;
3107 enum machine_mode pmode;
3108 int punsignedp, pvolatilep;
3109 affine_iv base_iv, offset_iv;
3110 tree dinit;
3112 /* Build a reference to the first location accessed by the
3113 inner-loop: *(BASE+INIT). (The first location is actually
3114 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3115 tree inner_base = build_fold_indirect_ref
3116 (fold_build_pointer_plus (base, init));
3118 if (dump_enabled_p ())
3120 dump_printf_loc (MSG_NOTE, vect_location,
3121 "analyze in outer-loop: ");
3122 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3125 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3126 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3127 gcc_assert (outer_base != NULL_TREE);
3129 if (pbitpos % BITS_PER_UNIT != 0)
3131 if (dump_enabled_p ())
3132 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3133 "failed: bit offset alignment.\n");
3134 return false;
3137 outer_base = build_fold_addr_expr (outer_base);
3138 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3139 &base_iv, false))
3141 if (dump_enabled_p ())
3142 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3143 "failed: evolution of base is not affine.\n");
3144 return false;
3147 if (offset)
3149 if (poffset)
3150 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3151 poffset);
3152 else
3153 poffset = offset;
3156 if (!poffset)
3158 offset_iv.base = ssize_int (0);
3159 offset_iv.step = ssize_int (0);
3161 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3162 &offset_iv, false))
3164 if (dump_enabled_p ())
3165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3166 "evolution of offset is not affine.\n");
3167 return false;
3170 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3171 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3172 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3173 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3174 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3176 outer_step = size_binop (PLUS_EXPR,
3177 fold_convert (ssizetype, base_iv.step),
3178 fold_convert (ssizetype, offset_iv.step));
3180 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3181 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3182 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3183 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3184 STMT_VINFO_DR_OFFSET (stmt_info) =
3185 fold_convert (ssizetype, offset_iv.base);
3186 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3187 size_int (highest_pow2_factor (offset_iv.base));
3189 if (dump_enabled_p ())
3191 dump_printf_loc (MSG_NOTE, vect_location,
3192 "\touter base_address: ");
3193 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3194 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3195 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3196 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3197 STMT_VINFO_DR_OFFSET (stmt_info));
3198 dump_printf (MSG_NOTE,
3199 "\n\touter constant offset from base address: ");
3200 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3201 STMT_VINFO_DR_INIT (stmt_info));
3202 dump_printf (MSG_NOTE, "\n\touter step: ");
3203 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3204 STMT_VINFO_DR_STEP (stmt_info));
3205 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3206 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3207 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3211 if (STMT_VINFO_DATA_REF (stmt_info))
3213 if (dump_enabled_p ())
3215 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3216 "not vectorized: more than one data ref "
3217 "in stmt: ");
3218 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3221 if (bb_vinfo)
3222 break;
3224 if (gather)
3225 free_data_ref (dr);
3226 return false;
3229 STMT_VINFO_DATA_REF (stmt_info) = dr;
3231 /* Set vectype for STMT. */
3232 scalar_type = TREE_TYPE (DR_REF (dr));
3233 STMT_VINFO_VECTYPE (stmt_info) =
3234 get_vectype_for_scalar_type (scalar_type);
3235 if (!STMT_VINFO_VECTYPE (stmt_info))
3237 if (dump_enabled_p ())
3239 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3240 "not vectorized: no vectype for stmt: ");
3241 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3242 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3243 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3244 scalar_type);
3247 if (bb_vinfo)
3248 break;
3250 if (gather)
3252 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3253 free_data_ref (dr);
3255 return false;
3257 else
3259 if (dump_enabled_p ())
3261 dump_printf_loc (MSG_NOTE, vect_location,
3262 "got vectype for stmt: ");
3263 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3264 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3265 STMT_VINFO_VECTYPE (stmt_info));
3269 /* Adjust the minimal vectorization factor according to the
3270 vector type. */
3271 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3272 if (vf > *min_vf)
3273 *min_vf = vf;
3275 if (gather)
3277 tree off;
3279 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3280 if (gather
3281 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3282 gather = false;
3283 if (!gather)
3285 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3286 free_data_ref (dr);
3287 if (dump_enabled_p ())
3289 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3290 "not vectorized: not suitable for gather "
3291 "load ");
3292 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3294 return false;
3297 datarefs[i] = dr;
3298 STMT_VINFO_GATHER_P (stmt_info) = true;
3300 else if (loop_vinfo
3301 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3303 if (nested_in_vect_loop_p (loop, stmt)
3304 || !DR_IS_READ (dr))
3306 if (dump_enabled_p ())
3308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3309 "not vectorized: not suitable for strided "
3310 "load ");
3311 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3313 return false;
3315 STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3319 /* If, when trying to vectorize a basic-block, we stopped the analysis at
3320 the first dataref we could not analyze, mark the rest of the datarefs
3321 as not vectorizable and truncate the vector of datarefs. That
3322 avoids spending useless time analyzing their dependences. */
3323 if (i != datarefs.length ())
3325 gcc_assert (bb_vinfo != NULL);
3326 for (unsigned j = i; j < datarefs.length (); ++j)
3328 data_reference_p dr = datarefs[j];
3329 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3330 free_data_ref (dr);
3332 datarefs.truncate (i);
3335 return true;
3339 /* Function vect_get_new_vect_var.
3341 Returns a name for a new variable. The current naming scheme prepends a
3342 prefix ("vect", "stmp" or "vectp", depending on the value of VAR_KIND)
3343 to NAME, if provided, to form the name of a vectorizer-generated
3344 variable. */
3346 tree
3347 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3349 const char *prefix;
3350 tree new_vect_var;
3352 switch (var_kind)
3354 case vect_simple_var:
3355 prefix = "vect";
3356 break;
3357 case vect_scalar_var:
3358 prefix = "stmp";
3359 break;
3360 case vect_pointer_var:
3361 prefix = "vectp";
3362 break;
3363 default:
3364 gcc_unreachable ();
3367 if (name)
3369 char* tmp = concat (prefix, "_", name, NULL);
3370 new_vect_var = create_tmp_reg (type, tmp);
3371 free (tmp);
3373 else
3374 new_vect_var = create_tmp_reg (type, prefix);
3376 return new_vect_var;
3380 /* Function vect_create_addr_base_for_vector_ref.
3382 Create an expression that computes the address of the first memory location
3383 that will be accessed for a data reference.
3385 Input:
3386 STMT: The statement containing the data reference.
3387 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3388 OFFSET: Optional. If supplied, it is added to the initial address.
3389 LOOP: Specify the loop-nest relative to which the address should be computed.
3390 For example, when the dataref is in an inner-loop nested in an
3391 outer-loop that is now being vectorized, LOOP can be either the
3392 outer-loop, or the inner-loop. The first memory location accessed
3393 by the following dataref ('in' points to short):
3395 for (i=0; i<N; i++)
3396 for (j=0; j<M; j++)
3397 s += in[i+j]
3399 is as follows:
3400 if LOOP=i_loop: &in (relative to i_loop)
3401 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3403 Output:
3404 1. Return an SSA_NAME whose value is the address of the memory location of
3405 the first vector of the data reference.
3406 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3407 these statement(s) which define the returned SSA_NAME.
3409 FORNOW: We are only handling array accesses with step 1. */
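 /* For illustration (hypothetical C analogue of the example above, with
    'in' pointing to short, i.e. 2-byte elements):

      short *base_for_i_loop = &in[0];        // LOOP == i_loop:  &in
      short *base_for_j_loop = &in[0] + i;    // LOOP == j_loop:  &in + i*2B

    The returned SSA_NAME holds such an address, and any statements that
    compute it are appended to NEW_STMT_LIST for the caller to insert.  */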
3411 tree
3412 vect_create_addr_base_for_vector_ref (gimple stmt,
3413 gimple_seq *new_stmt_list,
3414 tree offset,
3415 struct loop *loop)
3417 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3418 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3419 tree data_ref_base;
3420 const char *base_name;
3421 tree addr_base;
3422 tree dest;
3423 gimple_seq seq = NULL;
3424 tree base_offset;
3425 tree init;
3426 tree vect_ptr_type;
3427 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3428 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3430 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3432 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3434 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3436 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3437 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3438 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3440 else
3442 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3443 base_offset = unshare_expr (DR_OFFSET (dr));
3444 init = unshare_expr (DR_INIT (dr));
3447 if (loop_vinfo)
3448 base_name = get_name (data_ref_base);
3449 else
3451 base_offset = ssize_int (0);
3452 init = ssize_int (0);
3453 base_name = get_name (DR_REF (dr));
3456 /* Create base_offset */
3457 base_offset = size_binop (PLUS_EXPR,
3458 fold_convert (sizetype, base_offset),
3459 fold_convert (sizetype, init));
3461 if (offset)
3463 offset = fold_build2 (MULT_EXPR, sizetype,
3464 fold_convert (sizetype, offset), step);
3465 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3466 base_offset, offset);
3469 /* base + base_offset */
3470 if (loop_vinfo)
3471 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3472 else
3474 addr_base = build1 (ADDR_EXPR,
3475 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3476 unshare_expr (DR_REF (dr)));
3479 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3480 addr_base = fold_convert (vect_ptr_type, addr_base);
3481 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3482 addr_base = force_gimple_operand (addr_base, &seq, false, dest);
3483 gimple_seq_add_seq (new_stmt_list, seq);
3485 if (DR_PTR_INFO (dr)
3486 && TREE_CODE (addr_base) == SSA_NAME)
3488 duplicate_ssa_name_ptr_info (addr_base, DR_PTR_INFO (dr));
3489 if (offset)
3490 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3493 if (dump_enabled_p ())
3495 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3496 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3499 return addr_base;
3503 /* Function vect_create_data_ref_ptr.
3505 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3506 location accessed in the loop by STMT, along with the def-use update
3507 chain to appropriately advance the pointer through the loop iterations.
3508 Also set aliasing information for the pointer. This pointer is used by
3509 the callers to this function to create a memory reference expression for
3510 vector load/store access.
3512 Input:
3513 1. STMT: a stmt that references memory. Expected to be of the form
3514 GIMPLE_ASSIGN <name, data-ref> or
3515 GIMPLE_ASSIGN <data-ref, name>.
3516 2. AGGR_TYPE: the type of the reference, which should be either a vector
3517 or an array.
3518 3. AT_LOOP: the loop where the vector memref is to be created.
3519 4. OFFSET (optional): an offset to be added to the initial address accessed
3520 by the data-ref in STMT.
3521 5. BSI: location where the new stmts are to be placed if there is no loop
3522 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
3523 pointing to the initial address.
3525 Output:
3526 1. Declare a new ptr to vector_type, and have it point to the base of the
3527 data reference (the initial address accessed by the data reference).
3528 For example, for vector of type V8HI, the following code is generated:
3530 v8hi *ap;
3531 ap = (v8hi *)initial_address;
3533 if OFFSET is not supplied:
3534 initial_address = &a[init];
3535 if OFFSET is supplied:
3536 initial_address = &a[init + OFFSET];
3538 Return the initial_address in INITIAL_ADDRESS.
3540 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
3541 update the pointer in each iteration of the loop.
3543 Return the increment stmt that updates the pointer in PTR_INCR.
3545 3. Set INV_P to true if the access pattern of the data reference in the
3546 vectorized loop is invariant. Set it to false otherwise.
3548 4. Return the pointer. */
3550 tree
3551 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
3552 tree offset, tree *initial_address,
3553 gimple_stmt_iterator *gsi, gimple *ptr_incr,
3554 bool only_init, bool *inv_p)
3556 const char *base_name;
3557 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3558 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3559 struct loop *loop = NULL;
3560 bool nested_in_vect_loop = false;
3561 struct loop *containing_loop = NULL;
3562 tree aggr_ptr_type;
3563 tree aggr_ptr;
3564 tree new_temp;
3565 gimple vec_stmt;
3566 gimple_seq new_stmt_list = NULL;
3567 edge pe = NULL;
3568 basic_block new_bb;
3569 tree aggr_ptr_init;
3570 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3571 tree aptr;
3572 gimple_stmt_iterator incr_gsi;
3573 bool insert_after;
3574 tree indx_before_incr, indx_after_incr;
3575 gimple incr;
3576 tree step;
3577 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
3579 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
3580 || TREE_CODE (aggr_type) == VECTOR_TYPE);
3582 if (loop_vinfo)
3584 loop = LOOP_VINFO_LOOP (loop_vinfo);
3585 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
3586 containing_loop = (gimple_bb (stmt))->loop_father;
3587 pe = loop_preheader_edge (loop);
3589 else
3591 gcc_assert (bb_vinfo);
3592 only_init = true;
3593 *ptr_incr = NULL;
3596 /* Check the step (evolution) of the load in LOOP, and record
3597 whether it's invariant. */
3598 if (nested_in_vect_loop)
3599 step = STMT_VINFO_DR_STEP (stmt_info);
3600 else
3601 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
3603 if (integer_zerop (step))
3604 *inv_p = true;
3605 else
3606 *inv_p = false;
3608 /* Create an expression for the first address accessed by this load
3609 in LOOP. */
3610 base_name = get_name (DR_BASE_ADDRESS (dr));
3612 if (dump_enabled_p ())
3614 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
3615 dump_printf_loc (MSG_NOTE, vect_location,
3616 "create %s-pointer variable to type: ",
3617 tree_code_name[(int) TREE_CODE (aggr_type)]);
3618 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
3619 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
3620 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
3621 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
3622 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
3623 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
3624 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
3625 else
3626 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
3627 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
3630 /* (1) Create the new aggregate-pointer variable.
3631 Vector and array types inherit the alias set of their component
3632 type by default so we need to use a ref-all pointer if the data
3633 reference does not conflict with the created aggregated data
3634 reference because it is not addressable. */
3635 bool need_ref_all = false;
3636 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
3637 get_alias_set (DR_REF (dr))))
3638 need_ref_all = true;
3639 /* Likewise for any of the data references in the stmt group. */
3640 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
3642 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
3645 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
3646 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
3647 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
3648 get_alias_set (DR_REF (sdr))))
3650 need_ref_all = true;
3651 break;
3653 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
3655 while (orig_stmt);
3657 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
3658 need_ref_all);
3659 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
3662 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
3663 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
3664 def-use update cycles for the pointer: one relative to the outer-loop
3665 (LOOP), which is what steps (3) and (4) below do. The other is relative
3666 to the inner-loop (which is the inner-most loop containing the dataref),
3667 and this is done by step (5) below.
3669 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
3670 inner-most loop, and so steps (3),(4) work the same, and step (5) is
3671 redundant. Steps (3),(4) create the following:
3673 vp0 = &base_addr;
3674 LOOP: vp1 = phi(vp0,vp2)
3677 vp2 = vp1 + step
3678 goto LOOP
3680 If there is an inner-loop nested in loop, then step (5) will also be
3681 applied, and an additional update in the inner-loop will be created:
3683 vp0 = &base_addr;
3684 LOOP: vp1 = phi(vp0,vp2)
3686 inner: vp3 = phi(vp1,vp4)
3687 vp4 = vp3 + inner_step
3688 if () goto inner
3690 vp2 = vp1 + step
3691 if () goto LOOP */
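 /* For illustration (hypothetical scalar analogue of steps (3)/(4), assuming
    a 4-element vector of float):

      float *vp = base_addr;                  // vp0 in the scheme above
      for (long i = 0; i + 4 <= n; i += 4)
        {
          // ... use vp (one vector's worth of data per iteration) ...
          vp += 4;                            // vp2 = vp1 + step
        }

    Only the outer def-use cycle is shown; the inner-loop cycle of step (5)
    bumps a second pointer by the inner step in the same way.  */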
3693 /* (2) Calculate the initial address of the aggregate-pointer, and set
3694 the aggregate-pointer to point to it before the loop. */
3696 /* Create &(base[init_val+offset]) in the loop preheader. */
3698 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
3699 offset, loop);
3700 if (new_stmt_list)
3702 if (pe)
3704 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
3705 gcc_assert (!new_bb);
3707 else
3708 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
3711 *initial_address = new_temp;
3713 /* Create: p = (aggr_type *) initial_base */
3714 if (TREE_CODE (new_temp) != SSA_NAME
3715 || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
3717 vec_stmt = gimple_build_assign (aggr_ptr,
3718 fold_convert (aggr_ptr_type, new_temp));
3719 aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
3720 /* Copy the points-to information if it exists. */
3721 if (DR_PTR_INFO (dr))
3722 duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
3723 gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
3724 if (pe)
3726 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
3727 gcc_assert (!new_bb);
3729 else
3730 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
3732 else
3733 aggr_ptr_init = new_temp;
3735 /* (3) Handle the updating of the aggregate-pointer inside the loop.
3736 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
3737 inner-loop nested in LOOP (during outer-loop vectorization). */
3739 /* No update in loop is required. */
3740 if (only_init && (!loop_vinfo || at_loop == loop))
3741 aptr = aggr_ptr_init;
3742 else
3744 /* The step of the aggregate pointer is the type size. */
3745 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
3746 /* One exception to the above is when the scalar step of the load in
3747 LOOP is zero. In this case the step here is also zero. */
3748 if (*inv_p)
3749 iv_step = size_zero_node;
3750 else if (tree_int_cst_sgn (step) == -1)
3751 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
3753 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
3755 create_iv (aggr_ptr_init,
3756 fold_convert (aggr_ptr_type, iv_step),
3757 aggr_ptr, loop, &incr_gsi, insert_after,
3758 &indx_before_incr, &indx_after_incr);
3759 incr = gsi_stmt (incr_gsi);
3760 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
3762 /* Copy the points-to information if it exists. */
3763 if (DR_PTR_INFO (dr))
3765 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
3766 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
3768 if (ptr_incr)
3769 *ptr_incr = incr;
3771 aptr = indx_before_incr;
3774 if (!nested_in_vect_loop || only_init)
3775 return aptr;
3778 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
3779 nested in LOOP, if it exists. */
3781 gcc_assert (nested_in_vect_loop);
3782 if (!only_init)
3784 standard_iv_increment_position (containing_loop, &incr_gsi,
3785 &insert_after);
3786 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
3787 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
3788 &indx_after_incr);
3789 incr = gsi_stmt (incr_gsi);
3790 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
3792 /* Copy the points-to information if it exists. */
3793 if (DR_PTR_INFO (dr))
3795 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
3796 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
3798 if (ptr_incr)
3799 *ptr_incr = incr;
3801 return indx_before_incr;
3803 else
3804 gcc_unreachable ();
3808 /* Function bump_vector_ptr.
3810 Increment a pointer (to a vector type) by vector-size. If requested,
3811 i.e. if PTR-INCR is given, then also connect the new increment stmt
3812 to the existing def-use update-chain of the pointer, by modifying
3813 the PTR_INCR as illustrated below:
3815 The pointer def-use update-chain before this function:
3816 DATAREF_PTR = phi (p_0, p_2)
3817 ....
3818 PTR_INCR: p_2 = DATAREF_PTR + step
3820 The pointer def-use update-chain after this function:
3821 DATAREF_PTR = phi (p_0, p_2)
3822 ....
3823 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
3824 ....
3825 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
3827 Input:
3828 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
3829 in the loop.
3830 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
3831 the loop. The increment amount across iterations is expected
3832 to be vector_size.
3833 BSI - location where the new update stmt is to be placed.
3834 STMT - the original scalar memory-access stmt that is being vectorized.
3835 BUMP - optional. The offset by which to bump the pointer. If not given,
3836 the offset is assumed to be vector_size.
3838 Output: Return NEW_DATAREF_PTR as illustrated above.
3842 tree
3843 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
3844 gimple stmt, tree bump)
3846 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3847 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3848 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3849 tree update = TYPE_SIZE_UNIT (vectype);
3850 gimple incr_stmt;
3851 ssa_op_iter iter;
3852 use_operand_p use_p;
3853 tree new_dataref_ptr;
3855 if (bump)
3856 update = bump;
3858 new_dataref_ptr = copy_ssa_name (dataref_ptr, NULL);
3859 incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, new_dataref_ptr,
3860 dataref_ptr, update);
3861 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
3863 /* Copy the points-to information if it exists. */
3864 if (DR_PTR_INFO (dr))
3866 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
3867 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
3870 if (!ptr_incr)
3871 return new_dataref_ptr;
3873 /* Update the vector-pointer's cross-iteration increment. */
3874 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
3876 tree use = USE_FROM_PTR (use_p);
3878 if (use == dataref_ptr)
3879 SET_USE (use_p, new_dataref_ptr);
3880 else
3881 gcc_assert (tree_int_cst_compare (use, update) == 0);
3884 return new_dataref_ptr;
3888 /* Function vect_create_destination_var.
3890 Create a new temporary of type VECTYPE. */
3892 tree
3893 vect_create_destination_var (tree scalar_dest, tree vectype)
3895 tree vec_dest;
3896 const char *name;
3897 char *new_name;
3898 tree type;
3899 enum vect_var_kind kind;
3901 kind = vectype ? vect_simple_var : vect_scalar_var;
3902 type = vectype ? vectype : TREE_TYPE (scalar_dest);
3904 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
3906 name = get_name (scalar_dest);
3907 if (name)
3908 asprintf (&new_name, "%s_%u", name, SSA_NAME_VERSION (scalar_dest));
3909 else
3910 asprintf (&new_name, "_%u", SSA_NAME_VERSION (scalar_dest));
3911 vec_dest = vect_get_new_vect_var (type, kind, new_name);
3912 free (new_name);
3914 return vec_dest;
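/* For example (editorial note): for an SSA name "sum_7" (base name "sum",
   version 7) the code above builds the string "sum_7"; an anonymous SSA
   name with version 9 yields "_9".  The string is only used to give the
   new vector variable a readable name.  */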
3917 /* Function vect_grouped_store_supported.
3919 Returns TRUE if interleave high and interleave low permutations
3920 are supported, and FALSE otherwise. */
3922 bool
3923 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
3925 enum machine_mode mode = TYPE_MODE (vectype);
3927 /* vect_permute_store_chain requires the group size to be a power of two. */
3928 if (exact_log2 (count) == -1)
3930 if (dump_enabled_p ())
3931 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3932 "the size of the group of accesses"
3933 " is not a power of 2");
3934 return false;
3937 /* Check that the permutation is supported. */
3938 if (VECTOR_MODE_P (mode))
3940 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3941 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3942 for (i = 0; i < nelt / 2; i++)
3944 sel[i * 2] = i;
3945 sel[i * 2 + 1] = i + nelt;
3947 if (can_vec_perm_p (mode, false, sel))
3949 for (i = 0; i < nelt; i++)
3950 sel[i] += nelt / 2;
3951 if (can_vec_perm_p (mode, false, sel))
3952 return true;
3956 if (dump_enabled_p ())
3957 dump_printf (MSG_MISSED_OPTIMIZATION,
3958 "interleave op not supported by target.");
3959 return false;
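/* Illustrative sketch (editorial addition): for a vector mode with
   nelt == 8 the two selectors tested above are

     high: { 0, 8, 1, 9, 2, 10, 3, 11 }
     low:  { 4, 12, 5, 13, 6, 14, 7, 15 }

   i.e. the interleave-high/low patterns.  A minimal stand-alone C model
   of the selector construction (names are illustrative only):

     unsigned char sel[8];
     for (unsigned i = 0; i < 8 / 2; i++)
       {
         sel[i * 2] = i;          // high mask: 0, 8, 1, 9, ...
         sel[i * 2 + 1] = i + 8;
       }
     for (unsigned i = 0; i < 8; i++)
       sel[i] += 8 / 2;           // low mask: 4, 12, 5, 13, ...
 */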
3963 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
3964 type VECTYPE. */
3966 bool
3967 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
3969 return vect_lanes_optab_supported_p ("vec_store_lanes",
3970 vec_store_lanes_optab,
3971 vectype, count);
3975 /* Function vect_permute_store_chain.
3977 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
3978 a power of 2, generate interleave_high/low stmts to reorder the data
3979 correctly for the stores. Return the final references for stores in
3980 RESULT_CHAIN.
3982 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
3983 The input is 4 vectors each containing 8 elements. We assign a number to
3984 each element, the input sequence is:
3986 1st vec: 0 1 2 3 4 5 6 7
3987 2nd vec: 8 9 10 11 12 13 14 15
3988 3rd vec: 16 17 18 19 20 21 22 23
3989 4th vec: 24 25 26 27 28 29 30 31
3991 The output sequence should be:
3993 1st vec: 0 8 16 24 1 9 17 25
3994 2nd vec: 2 10 18 26 3 11 19 27
3995 3rd vec: 4 12 20 28 5 13 21 29
3996 4th vec: 6 14 22 30 7 15 23 31
3998 i.e., we interleave the contents of the four vectors in their order.
4000 We use interleave_high/low instructions to create such output. The input of
4001 each interleave_high/low operation is two vectors:
4002 1st vec 2nd vec
4003 0 1 2 3 4 5 6 7
4004 the even elements of the result vector are obtained left-to-right from the
4005 high/low elements of the first vector. The odd elements of the result are
4006 obtained left-to-right from the high/low elements of the second vector.
4007 The output of interleave_high will be: 0 4 1 5
4008 and of interleave_low: 2 6 3 7
4011 The permutation is done in log LENGTH stages. In each stage interleave_high
4012 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4013 where the first argument is taken from the first half of DR_CHAIN and the
4014 second argument from its second half.
4015 In our example,
4017 I1: interleave_high (1st vec, 3rd vec)
4018 I2: interleave_low (1st vec, 3rd vec)
4019 I3: interleave_high (2nd vec, 4th vec)
4020 I4: interleave_low (2nd vec, 4th vec)
4022 The output for the first stage is:
4024 I1: 0 16 1 17 2 18 3 19
4025 I2: 4 20 5 21 6 22 7 23
4026 I3: 8 24 9 25 10 26 11 27
4027 I4: 12 28 13 29 14 30 15 31
4029 The output of the second stage, i.e. the final result is:
4031 I1: 0 8 16 24 1 9 17 25
4032 I2: 2 10 18 26 3 11 19 27
4033 I3: 4 12 20 28 5 13 21 29
4034 I4: 6 14 22 30 7 15 23 31. */
4036 void
4037 vect_permute_store_chain (vec<tree> dr_chain,
4038 unsigned int length,
4039 gimple stmt,
4040 gimple_stmt_iterator *gsi,
4041 vec<tree> *result_chain)
4043 tree vect1, vect2, high, low;
4044 gimple perm_stmt;
4045 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4046 tree perm_mask_low, perm_mask_high;
4047 unsigned int i, n;
4048 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4049 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4051 result_chain->quick_grow (length);
4052 memcpy (result_chain->address (), dr_chain.address (),
4053 length * sizeof (tree));
4055 for (i = 0, n = nelt / 2; i < n; i++)
4057 sel[i * 2] = i;
4058 sel[i * 2 + 1] = i + nelt;
4060 perm_mask_high = vect_gen_perm_mask (vectype, sel);
4061 gcc_assert (perm_mask_high != NULL);
4063 for (i = 0; i < nelt; i++)
4064 sel[i] += nelt / 2;
4065 perm_mask_low = vect_gen_perm_mask (vectype, sel);
4066 gcc_assert (perm_mask_low != NULL);
4068 for (i = 0, n = exact_log2 (length); i < n; i++)
4070 for (j = 0; j < length/2; j++)
4072 vect1 = dr_chain[j];
4073 vect2 = dr_chain[j+length/2];
4075 /* Create interleaving stmt:
4076 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
4077 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4078 perm_stmt
4079 = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
4080 vect1, vect2, perm_mask_high);
4081 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4082 (*result_chain)[2*j] = high;
4084 /* Create interleaving stmt:
4085 low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
4086 nelt*3/2+1, ...}> */
4087 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4088 perm_stmt
4089 = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
4090 vect1, vect2, perm_mask_low);
4091 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4092 (*result_chain)[2*j+1] = low;
4094 memcpy (dr_chain.address (), result_chain->address (),
4095 length * sizeof (tree));
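/* Illustrative sketch (editorial addition): a scalar C model of the
   permutation above, for hypothetical NELT-element int vectors.  The
   helper only models the data movement of one interleave step:

     #define NELT 8
     static void
     interleave (const int *v1, const int *v2, int *hi, int *lo)
     {
       for (int i = 0; i < NELT / 2; i++)
         {
           hi[2 * i]     = v1[i];              // sel = {0, nelt, 1, nelt+1, ...}
           hi[2 * i + 1] = v2[i];
           lo[2 * i]     = v1[i + NELT / 2];   // sel = {nelt/2, 3*nelt/2, ...}
           lo[2 * i + 1] = v2[i + NELT / 2];
         }
     }

   Applying log2 (LENGTH) rounds over the (j, j + LENGTH/2) pairs, storing
   the high result in slot 2*j and the low result in slot 2*j+1 exactly as
   in the loop above, turns the inputs 0..7, 8..15, 16..23, 24..31 into the
   interleaved output shown in the function comment.  */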
4099 /* Function vect_setup_realignment.
4101 This function is called when vectorizing an unaligned load using
4102 the dr_explicit_realign[_optimized] scheme.
4103 This function generates the following code at the loop prolog:
4105 p = initial_addr;
4106 x msq_init = *(floor(p)); # prolog load
4107 realignment_token = call target_builtin;
4108 loop:
4109 x msq = phi (msq_init, ---)
4111 The stmts marked with x are generated only for the case of
4112 dr_explicit_realign_optimized.
4114 The code above sets up a new (vector) pointer, pointing to the first
4115 location accessed by STMT, and a "floor-aligned" load using that pointer.
4116 It also generates code to compute the "realignment-token" (if the relevant
4117 target hook was defined), and creates a phi-node at the loop-header bb
4118 whose arguments are the result of the prolog-load (created by this
4119 function) and the result of a load that takes place in the loop (to be
4120 created by the caller to this function).
4122 For the case of dr_explicit_realign_optimized:
4123 The caller to this function uses the phi-result (msq) to create the
4124 realignment code inside the loop, and sets up the missing phi argument,
4125 as follows:
4126 loop:
4127 msq = phi (msq_init, lsq)
4128 lsq = *(floor(p')); # load in loop
4129 result = realign_load (msq, lsq, realignment_token);
4131 For the case of dr_explicit_realign:
4132 loop:
4133 msq = *(floor(p)); # load in loop
4134 p' = p + (VS-1);
4135 lsq = *(floor(p')); # load in loop
4136 result = realign_load (msq, lsq, realignment_token);
4138 Input:
4139 STMT - (scalar) load stmt to be vectorized. This load accesses
4140 a memory location that may be unaligned.
4141 BSI - place where new code is to be inserted.
4142 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4143 is used.
4145 Output:
4146 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4147 target hook, if defined.
4148 Return value - the result of the loop-header phi node. */
4150 tree
4151 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4152 tree *realignment_token,
4153 enum dr_alignment_support alignment_support_scheme,
4154 tree init_addr,
4155 struct loop **at_loop)
4157 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4158 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4159 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4160 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4161 struct loop *loop = NULL;
4162 edge pe = NULL;
4163 tree scalar_dest = gimple_assign_lhs (stmt);
4164 tree vec_dest;
4165 gimple inc;
4166 tree ptr;
4167 tree data_ref;
4168 gimple new_stmt;
4169 basic_block new_bb;
4170 tree msq_init = NULL_TREE;
4171 tree new_temp;
4172 gimple phi_stmt;
4173 tree msq = NULL_TREE;
4174 gimple_seq stmts = NULL;
4175 bool inv_p;
4176 bool compute_in_loop = false;
4177 bool nested_in_vect_loop = false;
4178 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4179 struct loop *loop_for_initial_load = NULL;
4181 if (loop_vinfo)
4183 loop = LOOP_VINFO_LOOP (loop_vinfo);
4184 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4187 gcc_assert (alignment_support_scheme == dr_explicit_realign
4188 || alignment_support_scheme == dr_explicit_realign_optimized);
4190 /* We need to generate three things:
4191 1. the misalignment computation
4192 2. the extra vector load (for the optimized realignment scheme).
4193 3. the phi node for the two vectors from which the realignment is
4194 done (for the optimized realignment scheme). */
4196 /* 1. Determine where to generate the misalignment computation.
4198 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4199 calculation will be generated by this function, outside the loop (in the
4200 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4201 caller, inside the loop.
4203 Background: If the misalignment remains fixed throughout the iterations of
4204 the loop, then both realignment schemes are applicable, and also the
4205 misalignment computation can be done outside LOOP. This is because we are
4206 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4207 are a multiple of VS (the Vector Size), and therefore the misalignment in
4208 different vectorized LOOP iterations is always the same.
4209 The problem arises only if the memory access is in an inner-loop nested
4210 inside LOOP, which is now being vectorized using outer-loop vectorization.
4211 This is the only case when the misalignment of the memory access may not
4212 remain fixed throughout the iterations of the inner-loop (as explained in
4213 detail in vect_supportable_dr_alignment). In this case, not only is the
4214 optimized realignment scheme not applicable, but also the misalignment
4215 computation (and generation of the realignment token that is passed to
4216 REALIGN_LOAD) have to be done inside the loop.
4218 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4219 or not, which in turn determines if the misalignment is computed inside
4220 the inner-loop, or outside LOOP. */
4222 if (init_addr != NULL_TREE || !loop_vinfo)
4224 compute_in_loop = true;
4225 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4229 /* 2. Determine where to generate the extra vector load.
4231 For the optimized realignment scheme, instead of generating two vector
4232 loads in each iteration, we generate a single extra vector load in the
4233 preheader of the loop, and in each iteration reuse the result of the
4234 vector load from the previous iteration. In case the memory access is in
4235 an inner-loop nested inside LOOP, which is now being vectorized using
4236 outer-loop vectorization, we need to determine whether this initial vector
4237 load should be generated at the preheader of the inner-loop, or can be
4238 generated at the preheader of LOOP. If the memory access has no evolution
4239 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4240 to be generated inside LOOP (in the preheader of the inner-loop). */
4242 if (nested_in_vect_loop)
4244 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4245 bool invariant_in_outerloop =
4246 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4247 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4249 else
4250 loop_for_initial_load = loop;
4251 if (at_loop)
4252 *at_loop = loop_for_initial_load;
4254 if (loop_for_initial_load)
4255 pe = loop_preheader_edge (loop_for_initial_load);
4257 /* 3. For the case of the optimized realignment, create the first vector
4258 load at the loop preheader. */
4260 if (alignment_support_scheme == dr_explicit_realign_optimized)
4262 /* Create msq_init = *(floor(p1)) in the loop preheader */
4264 gcc_assert (!compute_in_loop);
4265 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4266 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4267 NULL_TREE, &init_addr, NULL, &inc,
4268 true, &inv_p);
4269 new_temp = copy_ssa_name (ptr, NULL);
4270 new_stmt = gimple_build_assign_with_ops
4271 (BIT_AND_EXPR, new_temp, ptr,
4272 build_int_cst (TREE_TYPE (ptr),
4273 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4274 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4275 gcc_assert (!new_bb);
4276 data_ref
4277 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4278 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4279 new_stmt = gimple_build_assign (vec_dest, data_ref);
4280 new_temp = make_ssa_name (vec_dest, new_stmt);
4281 gimple_assign_set_lhs (new_stmt, new_temp);
4282 if (pe)
4284 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4285 gcc_assert (!new_bb);
4287 else
4288 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4290 msq_init = gimple_assign_lhs (new_stmt);
4293 /* 4. Create realignment token using a target builtin, if available.
4294 It is done either inside the containing loop, or before LOOP (as
4295 determined above). */
4297 if (targetm.vectorize.builtin_mask_for_load)
4299 tree builtin_decl;
4301 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4302 if (!init_addr)
4304 /* Generate the INIT_ADDR computation outside LOOP. */
4305 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4306 NULL_TREE, loop);
4307 if (loop)
4309 pe = loop_preheader_edge (loop);
4310 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4311 gcc_assert (!new_bb);
4313 else
4314 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4317 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4318 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4319 vec_dest =
4320 vect_create_destination_var (scalar_dest,
4321 gimple_call_return_type (new_stmt));
4322 new_temp = make_ssa_name (vec_dest, new_stmt);
4323 gimple_call_set_lhs (new_stmt, new_temp);
4325 if (compute_in_loop)
4326 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4327 else
4329 /* Generate the misalignment computation outside LOOP. */
4330 pe = loop_preheader_edge (loop);
4331 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4332 gcc_assert (!new_bb);
4335 *realignment_token = gimple_call_lhs (new_stmt);
4337 /* The result of the CALL_EXPR to this builtin is determined from
4338 the value of the parameter, and no global variables are touched,
4339 which makes the builtin a "const" function. Requiring the
4340 builtin to have the "const" attribute makes it unnecessary
4341 to call mark_call_clobbered. */
4342 gcc_assert (TREE_READONLY (builtin_decl));
4345 if (alignment_support_scheme == dr_explicit_realign)
4346 return msq;
4348 gcc_assert (!compute_in_loop);
4349 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4352 /* 5. Create msq = phi <msq_init, lsq> in loop */
4354 pe = loop_preheader_edge (containing_loop);
4355 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4356 msq = make_ssa_name (vec_dest, NULL);
4357 phi_stmt = create_phi_node (msq, containing_loop->header);
4358 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4360 return msq;
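/* Illustrative sketch (editorial addition, not GCC code): the explicit
   realignment scheme modelled in scalar C for a hypothetical 4-element
   int vector.  FLOOR_P stands for P rounded down to the vector
   alignment, and OFF is the misalignment in elements, playing the role
   of the realignment token (the real token is a target-specific
   permutation mask produced by builtin_mask_for_load):

     int msq[4], lsq[4], va[4];
     int off = misalignment_in_elements;     // the "realignment token"
     for (int i = 0; i < 4; i++)             // msq = *(floor (p))
       msq[i] = floor_p[i];
     for (int i = 0; i < 4; i++)             // lsq = *(floor (p + VS - 1))
       lsq[i] = floor_p[4 + i];
     for (int i = 0; i < 4; i++)             // va = REALIGN_LOAD <msq, lsq, rt>
       va[i] = i + off < 4 ? msq[i + off] : lsq[i + off - 4];

   All names (floor_p, off, va) are illustrative only.  */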
4364 /* Function vect_grouped_load_supported.
4366 Returns TRUE if even and odd permutations are supported,
4367 and FALSE otherwise. */
4369 bool
4370 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4372 enum machine_mode mode = TYPE_MODE (vectype);
4374 /* vect_permute_load_chain requires the group size to be a power of two. */
4375 if (exact_log2 (count) == -1)
4377 if (dump_enabled_p ())
4378 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4379 "the size of the group of accesses"
4380 " is not a power of 2");
4381 return false;
4384 /* Check that the permutation is supported. */
4385 if (VECTOR_MODE_P (mode))
4387 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4388 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4390 for (i = 0; i < nelt; i++)
4391 sel[i] = i * 2;
4392 if (can_vec_perm_p (mode, false, sel))
4394 for (i = 0; i < nelt; i++)
4395 sel[i] = i * 2 + 1;
4396 if (can_vec_perm_p (mode, false, sel))
4397 return true;
4401 if (dump_enabled_p ())
4402 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4403 "extract even/odd not supported by target");
4404 return false;
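/* Illustrative sketch (editorial addition): for nelt == 8 the two
   selectors tested above are

     even: { 0, 2, 4, 6, 8, 10, 12, 14 }
     odd:  { 1, 3, 5, 7, 9, 11, 13, 15 }

   i.e. they pick the even/odd elements of the 16-element concatenation
   of the two input vectors, which are exactly the extract_even and
   extract_odd operations used by vect_permute_load_chain.  */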
4407 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
4408 type VECTYPE. */
4410 bool
4411 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4413 return vect_lanes_optab_supported_p ("vec_load_lanes",
4414 vec_load_lanes_optab,
4415 vectype, count);
4418 /* Function vect_permute_load_chain.
4420 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
4421 a power of 2, generate extract_even/odd stmts to reorder the input data
4422 correctly. Return the final references for loads in RESULT_CHAIN.
4424 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4425 The input is 4 vectors each containing 8 elements. We assign a number to each
4426 element, the input sequence is:
4428 1st vec: 0 1 2 3 4 5 6 7
4429 2nd vec: 8 9 10 11 12 13 14 15
4430 3rd vec: 16 17 18 19 20 21 22 23
4431 4th vec: 24 25 26 27 28 29 30 31
4433 The output sequence should be:
4435 1st vec: 0 4 8 12 16 20 24 28
4436 2nd vec: 1 5 9 13 17 21 25 29
4437 3rd vec: 2 6 10 14 18 22 26 30
4438 4th vec: 3 7 11 15 19 23 27 31
4440 i.e., the first output vector should contain the first elements of each
4441 interleaving group, etc.
4443 We use extract_even/odd instructions to create such output. The input of
4444 each extract_even/odd operation is two vectors
4445 1st vec 2nd vec
4446 0 1 2 3 4 5 6 7
4448 and the output is the vector of extracted even/odd elements. The output of
4449 extract_even will be: 0 2 4 6
4450 and of extract_odd: 1 3 5 7
4453 The permutation is done in log LENGTH stages. In each stage extract_even
4454 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
4455 their order. In our example,
4457 E1: extract_even (1st vec, 2nd vec)
4458 E2: extract_odd (1st vec, 2nd vec)
4459 E3: extract_even (3rd vec, 4th vec)
4460 E4: extract_odd (3rd vec, 4th vec)
4462 The output for the first stage will be:
4464 E1: 0 2 4 6 8 10 12 14
4465 E2: 1 3 5 7 9 11 13 15
4466 E3: 16 18 20 22 24 26 28 30
4467 E4: 17 19 21 23 25 27 29 31
4469 In order to proceed and create the correct sequence for the next stage (or
4470 for the correct output, if the second stage is the last one, as in our
4471 example), we first put the output of the extract_even operation and then the
4472 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
4473 The input for the second stage is:
4475 1st vec (E1): 0 2 4 6 8 10 12 14
4476 2nd vec (E3): 16 18 20 22 24 26 28 30
4477 3rd vec (E2): 1 3 5 7 9 11 13 15
4478 4th vec (E4): 17 19 21 23 25 27 29 31
4480 The output of the second stage:
4482 E1: 0 4 8 12 16 20 24 28
4483 E2: 2 6 10 14 18 22 26 30
4484 E3: 1 5 9 13 17 21 25 29
4485 E4: 3 7 11 15 19 23 27 31
4487 And RESULT_CHAIN after reordering:
4489 1st vec (E1): 0 4 8 12 16 20 24 28
4490 2nd vec (E3): 1 5 9 13 17 21 25 29
4491 3rd vec (E2): 2 6 10 14 18 22 26 30
4492 4th vec (E4): 3 7 11 15 19 23 27 31. */
4494 static void
4495 vect_permute_load_chain (vec<tree> dr_chain,
4496 unsigned int length,
4497 gimple stmt,
4498 gimple_stmt_iterator *gsi,
4499 vec<tree> *result_chain)
4501 tree data_ref, first_vect, second_vect;
4502 tree perm_mask_even, perm_mask_odd;
4503 gimple perm_stmt;
4504 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4505 unsigned int i, j, log_length = exact_log2 (length);
4506 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
4507 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4509 result_chain->quick_grow (length);
4510 memcpy (result_chain->address (), dr_chain.address (),
4511 length * sizeof (tree));
4513 for (i = 0; i < nelt; ++i)
4514 sel[i] = i * 2;
4515 perm_mask_even = vect_gen_perm_mask (vectype, sel);
4516 gcc_assert (perm_mask_even != NULL);
4518 for (i = 0; i < nelt; ++i)
4519 sel[i] = i * 2 + 1;
4520 perm_mask_odd = vect_gen_perm_mask (vectype, sel);
4521 gcc_assert (perm_mask_odd != NULL);
4523 for (i = 0; i < log_length; i++)
4525 for (j = 0; j < length; j += 2)
4527 first_vect = dr_chain[j];
4528 second_vect = dr_chain[j+1];
4530 /* data_ref = permute_even (first_data_ref, second_data_ref); */
4531 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
4532 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4533 first_vect, second_vect,
4534 perm_mask_even);
4535 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4536 (*result_chain)[j/2] = data_ref;
4538 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
4539 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
4540 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4541 first_vect, second_vect,
4542 perm_mask_odd);
4543 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4544 (*result_chain)[j/2+length/2] = data_ref;
4546 memcpy (dr_chain.address (), result_chain->address (),
4547 length * sizeof (tree));
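/* Illustrative sketch (editorial addition): a scalar C model of one
   extract_even/odd step above, for two hypothetical NELT-element int
   vectors.  The results pick every other element of the 2*NELT-element
   concatenation <v1, v2>:

     #define NELT 8
     static void
     extract_even_odd (const int *v1, const int *v2, int *even, int *odd)
     {
       for (int i = 0; i < NELT / 2; i++)
         {
           even[i]            = v1[2 * i];      // sel = { 0, 2, 4, ... }
           even[i + NELT / 2] = v2[2 * i];
           odd[i]             = v1[2 * i + 1];  // sel = { 1, 3, 5, ... }
           odd[i + NELT / 2]  = v2[2 * i + 1];
         }
     }

   Repeating this for log2 (LENGTH) stages over the (j, j+1) pairs, storing
   the even result in slot j/2 and the odd result in slot j/2 + LENGTH/2 as
   in the loop above, yields the deinterleaved output shown in the function
   comment.  */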
4552 /* Function vect_transform_grouped_load.
4554 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
4555 to perform their permutation and ascribe the resulting vectorized statements to
4556 the scalar statements.
4559 void
4560 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
4561 gimple_stmt_iterator *gsi)
4563 vec<tree> result_chain = vNULL;
4565 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
4566 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
4567 vectors, that are ready for vector computation. */
4568 result_chain.create (size);
4569 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
4570 vect_record_grouped_load_vectors (stmt, result_chain);
4571 result_chain.release ();
4574 /* RESULT_CHAIN contains the output of a group of grouped loads that were
4575 generated as part of the vectorization of STMT. Assign the statement
4576 for each vector to the associated scalar statement. */
4578 void
4579 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
4581 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
4582 gimple next_stmt, new_stmt;
4583 unsigned int i, gap_count;
4584 tree tmp_data_ref;
4586 /* Put a permuted data-ref in the VECTORIZED_STMT field.
4587 Since we scan the chain starting from its first node, their order
4588 corresponds to the order of data-refs in RESULT_CHAIN.
4589 next_stmt = first_stmt;
4590 gap_count = 1;
4591 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
4593 if (!next_stmt)
4594 break;
4596 /* Skip the gaps. Loads created for the gaps will be removed by the dead
4597 code elimination pass later. No need to check for the first stmt in
4598 the group, since it always exists.
4599 GROUP_GAP is the number of steps in elements from the previous
4600 access (if there is no gap GROUP_GAP is 1). We skip loads that
4601 correspond to the gaps. */
4602 if (next_stmt != first_stmt
4603 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
4605 gap_count++;
4606 continue;
4609 while (next_stmt)
4611 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
4612 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
4613 copies, and we put the new vector statement in the first available
4614 RELATED_STMT. */
4615 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
4616 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
4617 else
4619 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4621 gimple prev_stmt =
4622 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
4623 gimple rel_stmt =
4624 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
4625 while (rel_stmt)
4627 prev_stmt = rel_stmt;
4628 rel_stmt =
4629 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
4632 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
4633 new_stmt;
4637 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
4638 gap_count = 1;
4639 /* If NEXT_STMT accesses the same DR as the previous statement,
4640 put the same TMP_DATA_REF as its vectorized statement; otherwise
4641 get the next data-ref from RESULT_CHAIN. */
4642 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4643 break;
4648 /* Function vect_can_force_dr_alignment_p.
4650 Returns whether the alignment of a DECL can be forced to be aligned
4651 on an ALIGNMENT-bit boundary. */
4653 bool
4654 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
4656 if (TREE_CODE (decl) != VAR_DECL)
4657 return false;
4659 /* We cannot change alignment of common or external symbols as another
4660 translation unit may contain a definition with lower alignment.
4661 The rules of common symbol linking mean that the definition
4662 will override the common symbol. The same is true for constant
4663 pool entries which may be shared and are not properly merged
4664 by LTO. */
4665 if (DECL_EXTERNAL (decl)
4666 || DECL_COMMON (decl)
4667 || DECL_IN_CONSTANT_POOL (decl))
4668 return false;
4670 if (TREE_ASM_WRITTEN (decl))
4671 return false;
4673 /* Do not override the alignment as specified by the ABI when the used
4674 attribute is set. */
4675 if (DECL_PRESERVE_P (decl))
4676 return false;
4678 /* Do not override explicit alignment set by the user when an explicit
4679 section name is also used. This is a common idiom used by many
4680 software projects. */
4681 if (DECL_SECTION_NAME (decl) != NULL_TREE
4682 && !DECL_HAS_IMPLICIT_SECTION_NAME_P (decl))
4683 return false;
4685 if (TREE_STATIC (decl))
4686 return (alignment <= MAX_OFILE_ALIGNMENT);
4687 else
4688 return (alignment <= MAX_STACK_ALIGNMENT);
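/* For example (editorial note, hypothetical file-scope declarations):

     static int a[256];                  // TREE_STATIC: alignment may be
                                         // raised up to MAX_OFILE_ALIGNMENT
     extern int b[256];                  // DECL_EXTERNAL: not forceable, the
                                         // defining TU may assume less
     static int c[4] __attribute__((used)); // DECL_PRESERVE_P: keep the ABI
                                            // alignment, do not override
 */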
4692 /* Return whether the data reference DR is supported with respect to its
4693 alignment.
4694 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
4695 it is aligned, i.e., check if it is possible to vectorize it with different
4696 alignment. */
4698 enum dr_alignment_support
4699 vect_supportable_dr_alignment (struct data_reference *dr,
4700 bool check_aligned_accesses)
4702 gimple stmt = DR_STMT (dr);
4703 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4704 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4705 enum machine_mode mode = TYPE_MODE (vectype);
4706 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4707 struct loop *vect_loop = NULL;
4708 bool nested_in_vect_loop = false;
4710 if (aligned_access_p (dr) && !check_aligned_accesses)
4711 return dr_aligned;
4713 if (loop_vinfo)
4715 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
4716 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
4719 /* Possibly unaligned access. */
4721 /* We can choose between using the implicit realignment scheme (generating
4722 a misaligned_move stmt) and the explicit realignment scheme (generating
4723 aligned loads with a REALIGN_LOAD). There are two variants to the
4724 explicit realignment scheme: optimized, and unoptimized.
4725 We can optimize the realignment only if the step between consecutive
4726 vector loads is equal to the vector size. Since the vector memory
4727 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
4728 is guaranteed that the misalignment amount remains the same throughout the
4729 execution of the vectorized loop. Therefore, we can create the
4730 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
4731 at the loop preheader.
4733 However, in the case of outer-loop vectorization, when vectorizing a
4734 memory access in the inner-loop nested within the LOOP that is now being
4735 vectorized, while it is guaranteed that the misalignment of the
4736 vectorized memory access will remain the same in different outer-loop
4737 iterations, it is *not* guaranteed that it will remain the same throughout
4738 the execution of the inner-loop. This is because the inner-loop advances
4739 with the original scalar step (and not in steps of VS). If the inner-loop
4740 step happens to be a multiple of VS, then the misalignment remains fixed
4741 and we can use the optimized realignment scheme. For example:
4743 for (i=0; i<N; i++)
4744 for (j=0; j<M; j++)
4745 s += a[i+j];
4747 When vectorizing the i-loop in the above example, the step between
4748 consecutive vector loads is 1, and so the misalignment does not remain
4749 fixed across the execution of the inner-loop, and the realignment cannot
4750 be optimized (as illustrated in the following pseudo vectorized loop):
4752 for (i=0; i<N; i+=4)
4753 for (j=0; j<M; j++){
4754 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
4755 // when j is {0,1,2,3,4,5,6,7,...} respectively.
4756 // (assuming that we start from an aligned address).
4759 We therefore have to use the unoptimized realignment scheme:
4761 for (i=0; i<N; i+=4)
4762 for (j=k; j<M; j+=4)
4763 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
4764 // that the misalignment of the initial address is
4765 // 0).
4767 The loop can then be vectorized as follows:
4769 for (k=0; k<4; k++){
4770 rt = get_realignment_token (&vp[k]);
4771 for (i=0; i<N; i+=4){
4772 v1 = vp[i+k];
4773 for (j=k; j<M; j+=4){
4774 v2 = vp[i+j+VS-1];
4775 va = REALIGN_LOAD <v1,v2,rt>;
4776 vs += va;
4777 v1 = v2;
4780 } */
4782 if (DR_IS_READ (dr))
4784 bool is_packed = false;
4785 tree type = (TREE_TYPE (DR_REF (dr)));
4787 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
4788 && (!targetm.vectorize.builtin_mask_for_load
4789 || targetm.vectorize.builtin_mask_for_load ()))
4791 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4792 if ((nested_in_vect_loop
4793 && (TREE_INT_CST_LOW (DR_STEP (dr))
4794 != GET_MODE_SIZE (TYPE_MODE (vectype))))
4795 || !loop_vinfo)
4796 return dr_explicit_realign;
4797 else
4798 return dr_explicit_realign_optimized;
4800 if (!known_alignment_for_access_p (dr))
4801 is_packed = not_size_aligned (DR_REF (dr));
4803 if ((TYPE_USER_ALIGN (type) && !is_packed)
4804 || targetm.vectorize.
4805 support_vector_misalignment (mode, type,
4806 DR_MISALIGNMENT (dr), is_packed))
4807 /* Can't software pipeline the loads, but can at least do them. */
4808 return dr_unaligned_supported;
4810 else
4812 bool is_packed = false;
4813 tree type = (TREE_TYPE (DR_REF (dr)));
4815 if (!known_alignment_for_access_p (dr))
4816 is_packed = not_size_aligned (DR_REF (dr));
4818 if ((TYPE_USER_ALIGN (type) && !is_packed)
4819 || targetm.vectorize.
4820 support_vector_misalignment (mode, type,
4821 DR_MISALIGNMENT (dr), is_packed))
4822 return dr_unaligned_supported;
4825 /* Unsupported. */
4826 return dr_unaligned_unsupported;
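/* Summary (editorial note) of the possible results above:

     dr_aligned                    - the access is already aligned
     dr_explicit_realign           - REALIGN_LOAD, with the token and both
                                     aligned loads generated inside the loop
     dr_explicit_realign_optimized - token and first aligned load hoisted to
                                     the loop preheader
     dr_unaligned_supported        - the target handles the misaligned access
                                     directly
     dr_unaligned_unsupported      - none of the above applies.  */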