gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2013 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "tm_p.h"
30 #include "target.h"
31 #include "basic-block.h"
32 #include "gimple-pretty-print.h"
33 #include "tree-flow.h"
34 #include "dumpfile.h"
35 #include "cfgloop.h"
36 #include "tree-chrec.h"
37 #include "tree-scalar-evolution.h"
38 #include "tree-vectorizer.h"
39 #include "diagnostic-core.h"
41 /* Need to include rtl.h, expr.h, etc. for optabs. */
42 #include "expr.h"
43 #include "optabs.h"
45 /* Return true if load- or store-lanes optab OPTAB is implemented for
46 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
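/* For example, with COUNT == 3 and a 16-byte (128-bit) vector mode, the
   required array mode is a 384-bit integer mode; if the target provides no
   such mode (BLKmode) or has no load/store-lanes pattern converting between
   the array mode and the vector mode, the checks below fail.  */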
48 static bool
49 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
50 tree vectype, unsigned HOST_WIDE_INT count)
52 enum machine_mode mode, array_mode;
53 bool limit_p;
55 mode = TYPE_MODE (vectype);
56 limit_p = !targetm.array_mode_supported_p (mode, count);
57 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
58 MODE_INT, limit_p);
60 if (array_mode == BLKmode)
62 if (dump_enabled_p ())
63 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
64 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]",
65 GET_MODE_NAME (mode), count);
66 return false;
69 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
71 if (dump_enabled_p ())
72 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
73 "cannot use %s<%s><%s>", name,
74 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
75 return false;
78 if (dump_enabled_p ())
79 dump_printf_loc (MSG_NOTE, vect_location,
80 "can use %s<%s><%s>", name, GET_MODE_NAME (array_mode),
81 GET_MODE_NAME (mode));
83 return true;
87 /* Return the smallest scalar part of STMT.
88 This is used to determine the vectype of the stmt. We generally set the
89 vectype according to the type of the result (lhs). For stmts whose
90 result-type is different than the type of the arguments (e.g., demotion,
91 promotion), vectype will be reset appropriately (later). Note that we have
92 to visit the smallest datatype in this function, because that determines the
93 VF. If the smallest datatype in the loop is present only as the rhs of a
94 promotion operation - we'd miss it.
95 Such a case, where a variable of this datatype does not appear in the lhs
96 anywhere in the loop, can only occur if it's an invariant: e.g.:
97 'int_x = (int) short_inv', which we'd expect to have been optimized away by
98 invariant motion. However, we cannot rely on invariant motion to always
99 take invariants out of the loop, and so in the case of promotion we also
100 have to check the rhs.
101 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
102 types. */
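/* For instance, for the promotion S1: int_x = (int) short_y on a typical
   target, the lhs size is 4 bytes and the rhs size is 2 bytes, so the short
   type is returned and *LHS_SIZE_UNIT and *RHS_SIZE_UNIT are set to 4 and 2
   respectively.  */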
104 tree
105 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
106 HOST_WIDE_INT *rhs_size_unit)
108 tree scalar_type = gimple_expr_type (stmt);
109 HOST_WIDE_INT lhs, rhs;
111 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
113 if (is_gimple_assign (stmt)
114 && (gimple_assign_cast_p (stmt)
115 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
116 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
117 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
119 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
121 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
122 if (rhs < lhs)
123 scalar_type = rhs_type;
126 *lhs_size_unit = lhs;
127 *rhs_size_unit = rhs;
128 return scalar_type;
132 /* Find the place of the data-ref in STMT in the interleaving chain that starts
133 from FIRST_STMT. Return -1 if the data-ref is not a part of the chain. */
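/* For example, if FIRST_STMT starts the chain a[i] -> a[i+1] -> a[i+2], the
   data-ref for a[i+2] is at place 2, while a data-ref that is not linked
   into this chain yields -1.  */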
136 vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
138 gimple next_stmt = first_stmt;
139 int result = 0;
141 if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
142 return -1;
144 while (next_stmt && next_stmt != stmt)
146 result++;
147 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
150 if (next_stmt)
151 return result;
152 else
153 return -1;
157 /* Function vect_insert_into_interleaving_chain.
159 Insert DRA into the interleaving chain of DRB according to DRA's INIT. */
161 static void
162 vect_insert_into_interleaving_chain (struct data_reference *dra,
163 struct data_reference *drb)
165 gimple prev, next;
166 tree next_init;
167 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
168 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
170 prev = GROUP_FIRST_ELEMENT (stmtinfo_b);
171 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (prev));
172 while (next)
174 next_init = DR_INIT (STMT_VINFO_DATA_REF (vinfo_for_stmt (next)));
175 if (tree_int_cst_compare (next_init, DR_INIT (dra)) > 0)
177 /* Insert here. */
178 GROUP_NEXT_ELEMENT (vinfo_for_stmt (prev)) = DR_STMT (dra);
179 GROUP_NEXT_ELEMENT (stmtinfo_a) = next;
180 return;
182 prev = next;
183 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (prev));
186 /* We got to the end of the list. Insert here. */
187 GROUP_NEXT_ELEMENT (vinfo_for_stmt (prev)) = DR_STMT (dra);
188 GROUP_NEXT_ELEMENT (stmtinfo_a) = NULL;
192 /* Function vect_update_interleaving_chain.
194 For two data-refs DRA and DRB that are a part of a chain of interleaved data
195 accesses, update the interleaving chain. DRB's INIT is smaller than DRA's.
197 There are four possible cases:
198 1. New stmts - both DRA and DRB are not a part of any chain:
199 FIRST_DR = DRB
200 NEXT_DR (DRB) = DRA
201 2. DRB is a part of a chain and DRA is not:
202 no need to update FIRST_DR
203 no need to insert DRB
204 insert DRA according to init
205 3. DRA is a part of a chain and DRB is not:
206 if (init of FIRST_DR > init of DRB)
207 FIRST_DR = DRB
208 NEXT(FIRST_DR) = previous FIRST_DR
209 else
210 insert DRB according to its init
211 4. both DRA and DRB are in some interleaving chains:
212 choose the chain with the smallest init of FIRST_DR
213 insert the nodes of the second chain into the first one. */
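/* As an illustration, for three 4-byte stores a[0], a[2] and a[1]
   (DR_INITs 0, 8 and 4), the resulting chain is ordered by init:
   a[0] -> a[1] -> a[2], with the first element being the access with the
   smallest init.  */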
215 static void
216 vect_update_interleaving_chain (struct data_reference *drb,
217 struct data_reference *dra)
219 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
220 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
221 tree next_init, init_dra_chain, init_drb_chain;
222 gimple first_a, first_b;
223 tree node_init;
224 gimple node, prev, next, first_stmt;
226 /* 1. New stmts - both DRA and DRB are not a part of any chain. */
227 if (!GROUP_FIRST_ELEMENT (stmtinfo_a) && !GROUP_FIRST_ELEMENT (stmtinfo_b))
229 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (drb);
230 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (drb);
231 GROUP_NEXT_ELEMENT (stmtinfo_b) = DR_STMT (dra);
232 return;
235 /* 2. DRB is a part of a chain and DRA is not. */
236 if (!GROUP_FIRST_ELEMENT (stmtinfo_a) && GROUP_FIRST_ELEMENT (stmtinfo_b))
238 GROUP_FIRST_ELEMENT (stmtinfo_a) = GROUP_FIRST_ELEMENT (stmtinfo_b);
239 /* Insert DRA into the chain of DRB. */
240 vect_insert_into_interleaving_chain (dra, drb);
241 return;
244 /* 3. DRA is a part of a chain and DRB is not. */
245 if (GROUP_FIRST_ELEMENT (stmtinfo_a) && !GROUP_FIRST_ELEMENT (stmtinfo_b))
247 gimple old_first_stmt = GROUP_FIRST_ELEMENT (stmtinfo_a);
248 tree init_old = DR_INIT (STMT_VINFO_DATA_REF (vinfo_for_stmt (
249 old_first_stmt)));
250 gimple tmp;
252 if (tree_int_cst_compare (init_old, DR_INIT (drb)) > 0)
254 /* DRB's init is smaller than the init of the stmt previously marked
255 as the first stmt of the interleaving chain of DRA. Therefore, we
256 update FIRST_STMT and put DRB in the head of the list. */
257 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (drb);
258 GROUP_NEXT_ELEMENT (stmtinfo_b) = old_first_stmt;
260 /* Update all the stmts in the list to point to the new FIRST_STMT. */
261 tmp = old_first_stmt;
262 while (tmp)
264 GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) = DR_STMT (drb);
265 tmp = GROUP_NEXT_ELEMENT (vinfo_for_stmt (tmp));
268 else
270 /* Insert DRB in the list of DRA. */
271 vect_insert_into_interleaving_chain (drb, dra);
272 GROUP_FIRST_ELEMENT (stmtinfo_b) = GROUP_FIRST_ELEMENT (stmtinfo_a);
274 return;
277 /* 4. both DRA and DRB are in some interleaving chains. */
278 first_a = GROUP_FIRST_ELEMENT (stmtinfo_a);
279 first_b = GROUP_FIRST_ELEMENT (stmtinfo_b);
280 if (first_a == first_b)
281 return;
282 init_dra_chain = DR_INIT (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_a)));
283 init_drb_chain = DR_INIT (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_b)));
285 if (tree_int_cst_compare (init_dra_chain, init_drb_chain) > 0)
287 /* Insert the nodes of DRA chain into the DRB chain.
288 After inserting a node, continue from this node of the DRB chain (don't
289 start from the beginning). */
290 node = GROUP_FIRST_ELEMENT (stmtinfo_a);
291 prev = GROUP_FIRST_ELEMENT (stmtinfo_b);
292 first_stmt = first_b;
294 else
296 /* Insert the nodes of DRB chain into the DRA chain.
297 After inserting a node, continue from this node of the DRA chain (don't
298 start from the beginning). */
299 node = GROUP_FIRST_ELEMENT (stmtinfo_b);
300 prev = GROUP_FIRST_ELEMENT (stmtinfo_a);
301 first_stmt = first_a;
304 while (node)
306 node_init = DR_INIT (STMT_VINFO_DATA_REF (vinfo_for_stmt (node)));
307 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (prev));
308 while (next)
310 next_init = DR_INIT (STMT_VINFO_DATA_REF (vinfo_for_stmt (next)));
311 if (tree_int_cst_compare (next_init, node_init) > 0)
313 /* Insert here. */
314 GROUP_NEXT_ELEMENT (vinfo_for_stmt (prev)) = node;
315 GROUP_NEXT_ELEMENT (vinfo_for_stmt (node)) = next;
316 prev = node;
317 break;
319 prev = next;
320 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (prev));
322 if (!next)
324 /* We got to the end of the list. Insert here. */
325 GROUP_NEXT_ELEMENT (vinfo_for_stmt (prev)) = node;
326 GROUP_NEXT_ELEMENT (vinfo_for_stmt (node)) = NULL;
327 prev = node;
329 GROUP_FIRST_ELEMENT (vinfo_for_stmt (node)) = first_stmt;
330 node = GROUP_NEXT_ELEMENT (vinfo_for_stmt (node));
334 /* Check dependence between DRA and DRB for basic block vectorization.
335 If the accesses share the same bases and offsets, we can compare their initial
336 constant offsets to decide whether they differ or not. In case of a read-
337 write dependence we check that the load is before the store to ensure that
338 vectorization will not change the order of the accesses. */
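/* For instance, for the basic-block sequence x = a[0]; ...; a[0] = y; the
   load comes before the store, so no blocking dependence is reported; with
   the statements in the opposite order the function returns true.  */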
340 static bool
341 vect_drs_dependent_in_basic_block (struct data_reference *dra,
342 struct data_reference *drb)
344 HOST_WIDE_INT type_size_a, type_size_b, init_a, init_b;
345 gimple earlier_stmt;
347 /* We only call this function for pairs of loads and stores, but we verify
348 it here. */
349 if (DR_IS_READ (dra) == DR_IS_READ (drb))
351 if (DR_IS_READ (dra))
352 return false;
353 else
354 return true;
357 /* Check that the data-refs have same bases and offsets. If not, we can't
358 determine if they are dependent. */
359 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
360 || !dr_equal_offsets_p (dra, drb))
361 return true;
363 /* Check the types. */
364 type_size_a = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))));
365 type_size_b = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
367 if (type_size_a != type_size_b
368 || !types_compatible_p (TREE_TYPE (DR_REF (dra)),
369 TREE_TYPE (DR_REF (drb))))
370 return true;
372 init_a = TREE_INT_CST_LOW (DR_INIT (dra));
373 init_b = TREE_INT_CST_LOW (DR_INIT (drb));
375 /* Two different locations - no dependence. */
376 if (init_a != init_b)
377 return false;
379 /* We have a read-write dependence. Check that the load is before the store.
380 When we vectorize basic blocks, vector load can be only before
381 corresponding scalar load, and vector store can be only after its
382 corresponding scalar store. So the order of the accesses is preserved in
383 case the load is before the store. */
384 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
385 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
386 return false;
388 return true;
392 /* Function vect_check_interleaving.
394 Check if DRA and DRB are a part of interleaving. In case they are, insert
395 DRA and DRB in an interleaving chain. */
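/* For example, assuming 4-byte elements, two stores a[4*i] and a[4*i+2]
   have the same base, offset and step (16 bytes) and inits 0 and 8;
   since (8 - 0) % 4 == 0 and 8 <= 16, they are recorded as members of one
   interleaving chain.  */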
397 static bool
398 vect_check_interleaving (struct data_reference *dra,
399 struct data_reference *drb)
401 HOST_WIDE_INT type_size_a, type_size_b, diff_mod_size, step, init_a, init_b;
403 /* Check that the data-refs have the same first location (except init) and they
404 are both either store or load (not load and store). */
405 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
406 || !dr_equal_offsets_p (dra, drb)
407 || !tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb))
408 || DR_IS_READ (dra) != DR_IS_READ (drb))
409 return false;
411 /* Check:
412 1. data-refs are of the same type
413 2. their steps are equal
414 3. the step (if greater than zero) is greater than the difference between
415 data-refs' inits. */
416 type_size_a = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))));
417 type_size_b = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
419 if (type_size_a != type_size_b
420 || tree_int_cst_compare (DR_STEP (dra), DR_STEP (drb))
421 || !types_compatible_p (TREE_TYPE (DR_REF (dra)),
422 TREE_TYPE (DR_REF (drb))))
423 return false;
425 init_a = TREE_INT_CST_LOW (DR_INIT (dra));
426 init_b = TREE_INT_CST_LOW (DR_INIT (drb));
427 step = TREE_INT_CST_LOW (DR_STEP (dra));
429 if (init_a > init_b)
431 /* If init_a == init_b + the size of the type * k, we have an interleaving,
432 and DRB is accessed before DRA. */
433 diff_mod_size = (init_a - init_b) % type_size_a;
435 if (step && (init_a - init_b) > step)
436 return false;
438 if (diff_mod_size == 0)
440 vect_update_interleaving_chain (drb, dra);
441 if (dump_enabled_p ())
443 dump_printf_loc (MSG_NOTE, vect_location,
444 "Detected interleaving ");
445 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
446 dump_printf (MSG_NOTE, " and ");
447 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
449 return true;
452 else
454 /* If init_b == init_a + the size of the type * k, we have an
455 interleaving, and DRA is accessed before DRB. */
456 diff_mod_size = (init_b - init_a) % type_size_a;
458 if (step && (init_b - init_a) > step)
459 return false;
461 if (diff_mod_size == 0)
463 vect_update_interleaving_chain (dra, drb);
464 if (dump_enabled_p ())
466 dump_printf_loc (MSG_NOTE, vect_location,
467 "Detected interleaving ");
468 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
469 dump_printf (MSG_NOTE, " and ");
470 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
472 return true;
476 return false;
479 /* Check if the data references pointed to by DR_I and DR_J are the same or
480 belong to the same interleaving group. Return FALSE if the drs are
481 different, otherwise return TRUE. */
483 static bool
484 vect_same_range_drs (data_reference_p dr_i, data_reference_p dr_j)
486 gimple stmt_i = DR_STMT (dr_i);
487 gimple stmt_j = DR_STMT (dr_j);
489 if (operand_equal_p (DR_REF (dr_i), DR_REF (dr_j), 0)
490 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
491 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j))
492 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
493 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j)))))
494 return true;
495 else
496 return false;
499 /* If address ranges represented by DDR_I and DDR_J are equal,
500 return TRUE, otherwise return FALSE. */
502 static bool
503 vect_vfa_range_equal (ddr_p ddr_i, ddr_p ddr_j)
505 if ((vect_same_range_drs (DDR_A (ddr_i), DDR_A (ddr_j))
506 && vect_same_range_drs (DDR_B (ddr_i), DDR_B (ddr_j)))
507 || (vect_same_range_drs (DDR_A (ddr_i), DDR_B (ddr_j))
508 && vect_same_range_drs (DDR_B (ddr_i), DDR_A (ddr_j))))
509 return true;
510 else
511 return false;
514 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
515 tested at run-time. Return TRUE if DDR was successfully inserted.
516 Return false if versioning is not supported. */
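/* The run-time test generated for such a DDR is roughly a check that the
   address ranges accessed through DDR_A and DDR_B during the loop do not
   overlap, e.g. (a_end <= b_start || b_end <= a_start); the loop is then
   versioned on that condition.  */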
518 static bool
519 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
521 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
523 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
524 return false;
526 if (dump_enabled_p ())
528 dump_printf_loc (MSG_NOTE, vect_location,
529 "mark for run-time aliasing test between ");
530 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
531 dump_printf (MSG_NOTE, " and ");
532 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
535 if (optimize_loop_nest_for_size_p (loop))
537 if (dump_enabled_p ())
538 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
539 "versioning not supported when optimizing for size.");
540 return false;
543 /* FORNOW: We don't support versioning with outer-loop vectorization. */
544 if (loop->inner)
546 if (dump_enabled_p ())
547 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
548 "versioning not yet supported for outer-loops.");
549 return false;
552 /* FORNOW: We don't support creating runtime alias tests for non-constant
553 step. */
554 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
555 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
557 if (dump_enabled_p ())
558 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
559 "versioning not yet supported for non-constant "
560 "step");
561 return false;
564 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
565 return true;
569 /* Function vect_analyze_data_ref_dependence.
571 Return TRUE if there (might) exist a dependence between a memory-reference
572 DRA and a memory-reference DRB. When the dependence can instead be checked
573 at run-time by versioning for alias, return FALSE. Adjust *MAX_VF according
574 to the data dependence. */
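/* For example, in for (i = 0; i < N; i++) a[i + 3] = a[i] + 1; there is a
   read-after-write dependence with distance 3; it does not forbid
   vectorization but limits *MAX_VF to 3.  */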
576 static bool
577 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
578 loop_vec_info loop_vinfo, int *max_vf)
580 unsigned int i;
581 struct loop *loop = NULL;
582 struct data_reference *dra = DDR_A (ddr);
583 struct data_reference *drb = DDR_B (ddr);
584 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
585 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
586 lambda_vector dist_v;
587 unsigned int loop_depth;
589 /* Don't bother to analyze statements marked as unvectorizable. */
590 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
591 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
592 return false;
594 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
596 /* Independent data accesses. */
597 vect_check_interleaving (dra, drb);
598 return false;
601 if (loop_vinfo)
602 loop = LOOP_VINFO_LOOP (loop_vinfo);
604 if ((DR_IS_READ (dra) && DR_IS_READ (drb) && loop_vinfo) || dra == drb)
605 return false;
607 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
609 gimple earlier_stmt;
611 if (loop_vinfo)
613 if (dump_enabled_p ())
615 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
616 "versioning for alias required: "
617 "can't determine dependence between ");
618 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
619 DR_REF (dra));
620 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
621 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
622 DR_REF (drb));
625 /* Add to list of ddrs that need to be tested at run-time. */
626 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
629 /* When vectorizing a basic block an unknown dependence can still mean
630 a grouped access. */
631 if (vect_check_interleaving (dra, drb))
632 return false;
634 /* Read-read is OK (we need this check here, after checking for
635 interleaving). */
636 if (DR_IS_READ (dra) && DR_IS_READ (drb))
637 return false;
639 if (dump_enabled_p ())
641 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
642 "can't determine dependence between ");
643 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
644 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
645 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
648 /* We do not vectorize basic blocks with write-write dependencies. */
649 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
650 return true;
652 /* Check that it's not a load-after-store dependence. */
653 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
654 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
655 return true;
657 return false;
660 /* Versioning for alias is not yet supported for basic block SLP, and
661 dependence distance is not applicable there; hence, in case of a known
662 data dependence, basic block vectorization is impossible for now. */
663 if (!loop_vinfo)
665 if (dra != drb && vect_check_interleaving (dra, drb))
666 return false;
668 if (dump_enabled_p ())
670 dump_printf_loc (MSG_NOTE, vect_location,
671 "determined dependence between ");
672 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
673 dump_printf (MSG_NOTE, " and ");
674 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
677 /* Do not vectorize basic blocks with write-write dependences. */
678 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
679 return true;
681 /* Check if this dependence is allowed in basic block vectorization. */
682 return vect_drs_dependent_in_basic_block (dra, drb);
685 /* Loop-based vectorization and known data dependence. */
686 if (DDR_NUM_DIST_VECTS (ddr) == 0)
688 if (dump_enabled_p ())
690 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
691 "versioning for alias required: "
692 "bad dist vector for ");
693 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
694 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
695 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
697 /* Add to list of ddrs that need to be tested at run-time. */
698 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
701 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
702 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
704 int dist = dist_v[loop_depth];
706 if (dump_enabled_p ())
707 dump_printf_loc (MSG_NOTE, vect_location,
708 "dependence distance = %d.", dist);
710 if (dist == 0)
712 if (dump_enabled_p ())
714 dump_printf_loc (MSG_NOTE, vect_location,
715 "dependence distance == 0 between ");
716 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
717 dump_printf (MSG_NOTE, " and ");
718 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
721 /* For interleaving, mark that there is a read-write dependency if
722 necessary. We have already checked that one of the data-refs is a store. */
723 if (DR_IS_READ (dra))
724 GROUP_READ_WRITE_DEPENDENCE (stmtinfo_a) = true;
725 else
727 if (DR_IS_READ (drb))
728 GROUP_READ_WRITE_DEPENDENCE (stmtinfo_b) = true;
731 continue;
734 if (dist > 0 && DDR_REVERSED_P (ddr))
736 /* If DDR_REVERSED_P the order of the data-refs in DDR was
737 reversed (to make distance vector positive), and the actual
738 distance is negative. */
739 if (dump_enabled_p ())
740 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
741 "dependence distance negative.");
742 /* Record a negative dependence distance to later limit the
743 amount of stmt copying / unrolling we can perform.
744 Only need to handle read-after-write dependence. */
745 if (DR_IS_READ (drb)
746 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
747 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
748 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
749 continue;
752 if (abs (dist) >= 2
753 && abs (dist) < *max_vf)
755 /* The dependence distance requires reduction of the maximal
756 vectorization factor. */
757 *max_vf = abs (dist);
758 if (dump_enabled_p ())
759 dump_printf_loc (MSG_NOTE, vect_location,
760 "adjusting maximal vectorization factor to %i",
761 *max_vf);
764 if (abs (dist) >= *max_vf)
766 /* Dependence distance does not create dependence, as far as
767 vectorization is concerned, in this case. */
768 if (dump_enabled_p ())
769 dump_printf_loc (MSG_NOTE, vect_location,
770 "dependence distance >= VF.");
771 continue;
774 if (dump_enabled_p ())
776 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
777 "not vectorized, possible dependence "
778 "between data-refs ");
779 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
780 dump_printf (MSG_NOTE, " and ");
781 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
784 return true;
787 return false;
790 /* Function vect_analyze_data_ref_dependences.
792 Examine all the data references in the loop, and make sure there do not
793 exist any data dependences between them. Set *MAX_VF according to
794 the maximum vectorization factor the data dependences allow. */
796 bool
797 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
798 bb_vec_info bb_vinfo, int *max_vf)
800 unsigned int i;
801 vec<ddr_p> ddrs = vNULL;
802 struct data_dependence_relation *ddr;
804 if (dump_enabled_p ())
805 dump_printf_loc (MSG_NOTE, vect_location,
806 "=== vect_analyze_dependences ===");
807 if (loop_vinfo)
808 ddrs = LOOP_VINFO_DDRS (loop_vinfo);
809 else
810 ddrs = BB_VINFO_DDRS (bb_vinfo);
812 FOR_EACH_VEC_ELT (ddrs, i, ddr)
813 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
814 return false;
816 return true;
820 /* Function vect_compute_data_ref_alignment
822 Compute the misalignment of the data reference DR.
824 Output:
825 1. If during the misalignment computation it is found that the data reference
826 cannot be vectorized then false is returned.
827 2. DR_MISALIGNMENT (DR) is defined.
829 FOR NOW: No analysis is actually performed. Misalignment is calculated
830 only for trivial cases. TODO. */
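/* For example, for a V4SI access (16-byte vectors) whose base address is
   known to be 16-byte aligned and whose DR_INIT is 4, the misalignment
   recorded below is 4 % 16 == 4 bytes.  */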
832 static bool
833 vect_compute_data_ref_alignment (struct data_reference *dr)
835 gimple stmt = DR_STMT (dr);
836 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
837 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
838 struct loop *loop = NULL;
839 tree ref = DR_REF (dr);
840 tree vectype;
841 tree base, base_addr;
842 bool base_aligned;
843 tree misalign;
844 tree aligned_to, alignment;
846 if (dump_enabled_p ())
847 dump_printf_loc (MSG_NOTE, vect_location,
848 "vect_compute_data_ref_alignment:");
850 if (loop_vinfo)
851 loop = LOOP_VINFO_LOOP (loop_vinfo);
853 /* Initialize misalignment to unknown. */
854 SET_DR_MISALIGNMENT (dr, -1);
856 /* Strided loads perform only component accesses, misalignment information
857 is irrelevant for them. */
858 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
859 return true;
861 misalign = DR_INIT (dr);
862 aligned_to = DR_ALIGNED_TO (dr);
863 base_addr = DR_BASE_ADDRESS (dr);
864 vectype = STMT_VINFO_VECTYPE (stmt_info);
866 /* In case the dataref is in an inner-loop of the loop that is being
867 vectorized (LOOP), we use the base and misalignment information
868 relative to the outer-loop (LOOP). This is ok only if the misalignment
869 stays the same throughout the execution of the inner-loop, which is why
870 we have to check that the stride of the dataref in the inner-loop is
871 divisible by the vector size. */
872 if (loop && nested_in_vect_loop_p (loop, stmt))
874 tree step = DR_STEP (dr);
875 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
877 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
879 if (dump_enabled_p ())
880 dump_printf_loc (MSG_NOTE, vect_location,
881 "inner step divides the vector-size.");
882 misalign = STMT_VINFO_DR_INIT (stmt_info);
883 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
884 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
886 else
888 if (dump_enabled_p ())
889 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
890 "inner step doesn't divide the vector-size.");
891 misalign = NULL_TREE;
895 /* Similarly, if we're doing basic-block vectorization, we can only use
896 base and misalignment information relative to an innermost loop if the
897 misalignment stays the same throughout the execution of the loop.
898 As above, this is the case if the stride of the dataref is divisible
899 by the vector size. */
900 if (!loop)
902 tree step = DR_STEP (dr);
903 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
905 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
907 if (dump_enabled_p ())
908 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
909 "SLP: step doesn't divide the vector-size.");
910 misalign = NULL_TREE;
914 base = build_fold_indirect_ref (base_addr);
915 alignment = ssize_int (TYPE_ALIGN (vectype) / BITS_PER_UNIT);
917 if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
918 || !misalign)
920 if (dump_enabled_p ())
922 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
923 "Unknown alignment for access: ");
924 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
926 return true;
929 if ((DECL_P (base)
930 && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
931 alignment) >= 0)
932 || (TREE_CODE (base_addr) == SSA_NAME
933 && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
934 TREE_TYPE (base_addr)))),
935 alignment) >= 0)
936 || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
937 base_aligned = true;
938 else
939 base_aligned = false;
941 if (!base_aligned)
943 /* Do not change the alignment of global variables here if
944 flag_section_anchors is enabled as we already generated
945 RTL for other functions. Most global variables should
946 have been aligned during the IPA increase_alignment pass. */
947 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
948 || (TREE_STATIC (base) && flag_section_anchors))
950 if (dump_enabled_p ())
952 dump_printf_loc (MSG_NOTE, vect_location,
953 "can't force alignment of ref: ");
954 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
956 return true;
959 /* Force the alignment of the decl.
960 NOTE: This is the only change to the code we make during
961 the analysis phase, before deciding to vectorize the loop. */
962 if (dump_enabled_p ())
964 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
965 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
968 DECL_ALIGN (base) = TYPE_ALIGN (vectype);
969 DECL_USER_ALIGN (base) = 1;
972 /* At this point we assume that the base is aligned. */
973 gcc_assert (base_aligned
974 || (TREE_CODE (base) == VAR_DECL
975 && DECL_ALIGN (base) >= TYPE_ALIGN (vectype)));
977 /* If this is a backward running DR then first access in the larger
978 vectype actually is N-1 elements before the address in the DR.
979 Adjust misalign accordingly. */
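/* For example, for a V4SI access with DR_STEP == -4 the adjustment below is
   (4 - 1) * -4 == -12, so a misalignment of 12 becomes 0 before the final
   modulo.  */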
980 if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
982 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
983 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
984 otherwise we wouldn't be here. */
985 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
986 /* PLUS because DR_STEP was negative. */
987 misalign = size_binop (PLUS_EXPR, misalign, offset);
990 /* Modulo alignment. */
991 misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);
993 if (!host_integerp (misalign, 1))
995 /* Negative or overflowed misalignment value. */
996 if (dump_enabled_p ())
997 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
998 "unexpected misalign value");
999 return false;
1002 SET_DR_MISALIGNMENT (dr, TREE_INT_CST_LOW (misalign));
1004 if (dump_enabled_p ())
1006 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1007 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1008 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1011 return true;
1015 /* Function vect_compute_data_refs_alignment
1017 Compute the misalignment of data references in the loop.
1018 Return FALSE if a data reference is found that cannot be vectorized. */
1020 static bool
1021 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
1022 bb_vec_info bb_vinfo)
1024 vec<data_reference_p> datarefs;
1025 struct data_reference *dr;
1026 unsigned int i;
1028 if (loop_vinfo)
1029 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1030 else
1031 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
1033 FOR_EACH_VEC_ELT (datarefs, i, dr)
1034 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
1035 && !vect_compute_data_ref_alignment (dr))
1037 if (bb_vinfo)
1039 /* Mark unsupported statement as unvectorizable. */
1040 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
1041 continue;
1043 else
1044 return false;
1047 return true;
1051 /* Function vect_update_misalignment_for_peel
1053 DR - the data reference whose misalignment is to be adjusted.
1054 DR_PEEL - the data reference whose misalignment is being made
1055 zero in the vector loop by the peel.
1056 NPEEL - the number of iterations in the peel loop if the misalignment
1057 of DR_PEEL is known at compile time. */
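/* For example, with 4-byte elements, 16-byte vectors and NPEEL == 2, a
   data-ref with known misalignment 4 and a positive step is updated to
   misalignment (4 + 2 * 4) & 15 == 12.  */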
1059 static void
1060 vect_update_misalignment_for_peel (struct data_reference *dr,
1061 struct data_reference *dr_peel, int npeel)
1063 unsigned int i;
1064 vec<dr_p> same_align_drs;
1065 struct data_reference *current_dr;
1066 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
1067 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
1068 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1069 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
1071 /* For interleaved data accesses the step in the loop must be multiplied by
1072 the size of the interleaving group. */
1073 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1074 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
1075 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1076 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
1078 /* It can be assumed that the data refs with the same alignment as dr_peel
1079 are aligned in the vector loop. */
1080 same_align_drs
1081 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1082 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
1084 if (current_dr != dr)
1085 continue;
1086 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
1087 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
1088 SET_DR_MISALIGNMENT (dr, 0);
1089 return;
1092 if (known_alignment_for_access_p (dr)
1093 && known_alignment_for_access_p (dr_peel))
1095 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1096 int misal = DR_MISALIGNMENT (dr);
1097 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1098 misal += negative ? -npeel * dr_size : npeel * dr_size;
1099 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
1100 SET_DR_MISALIGNMENT (dr, misal);
1101 return;
1104 if (dump_enabled_p ())
1105 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.");
1106 SET_DR_MISALIGNMENT (dr, -1);
1110 /* Function vect_verify_datarefs_alignment
1112 Return TRUE if all data references in the loop can be
1113 handled with respect to alignment. */
1115 bool
1116 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
1118 vec<data_reference_p> datarefs;
1119 struct data_reference *dr;
1120 enum dr_alignment_support supportable_dr_alignment;
1121 unsigned int i;
1123 if (loop_vinfo)
1124 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1125 else
1126 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
1128 FOR_EACH_VEC_ELT (datarefs, i, dr)
1130 gimple stmt = DR_STMT (dr);
1131 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1133 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1134 continue;
1136 /* For interleaving, only the alignment of the first access matters.
1137 Skip statements marked as not vectorizable. */
1138 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
1139 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1140 || !STMT_VINFO_VECTORIZABLE (stmt_info))
1141 continue;
1143 /* Strided loads perform only component accesses, alignment is
1144 irrelevant for them. */
1145 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1146 continue;
1148 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1149 if (!supportable_dr_alignment)
1151 if (dump_enabled_p ())
1153 if (DR_IS_READ (dr))
1154 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1155 "not vectorized: unsupported unaligned load.");
1156 else
1157 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1158 "not vectorized: unsupported unaligned "
1159 "store.");
1161 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1162 DR_REF (dr));
1164 return false;
1166 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1167 dump_printf_loc (MSG_NOTE, vect_location,
1168 "Vectorizing an unaligned access.");
1170 return true;
1173 /* Given a memory reference EXP, return whether its alignment is less
1174 than its size. */
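/* E.g. a 4-byte integer field of a packed structure may have an object
   alignment of only one byte; its size in bits (32) then exceeds its
   alignment (8), so the function returns true.  */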
1176 static bool
1177 not_size_aligned (tree exp)
1179 if (!host_integerp (TYPE_SIZE (TREE_TYPE (exp)), 1))
1180 return true;
1182 return (TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (exp)))
1183 > get_object_alignment (exp));
1186 /* Function vector_alignment_reachable_p
1188 Return true if vector alignment for DR is reachable by peeling
1189 a few loop iterations. Return false otherwise. */
1191 static bool
1192 vector_alignment_reachable_p (struct data_reference *dr)
1194 gimple stmt = DR_STMT (dr);
1195 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1196 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1198 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1200 /* For interleaved accesses we peel only if the number of iterations in
1201 the prolog loop ({VF - misalignment}) is a multiple of the
1202 number of the interleaved accesses. */
1203 int elem_size, mis_in_elements;
1204 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1206 /* FORNOW: handle only known alignment. */
1207 if (!known_alignment_for_access_p (dr))
1208 return false;
1210 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1211 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1213 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1214 return false;
1217 /* If misalignment is known at compile time then allow peeling
1218 only if natural alignment is reachable through peeling. */
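/* E.g. an access with 4-byte elements and a known misalignment of 2 bytes
   can never be aligned by peeling whole iterations, since each peeled
   iteration advances the access by a multiple of 4 bytes; 2 % 4 != 0
   rejects it below.  */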
1219 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1221 HOST_WIDE_INT elmsize =
1222 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1223 if (dump_enabled_p ())
1225 dump_printf_loc (MSG_NOTE, vect_location,
1226 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1227 dump_printf (MSG_NOTE,
1228 ". misalignment = %d. ", DR_MISALIGNMENT (dr));
1230 if (DR_MISALIGNMENT (dr) % elmsize)
1232 if (dump_enabled_p ())
1233 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1234 "data size does not divide the misalignment.\n");
1235 return false;
1239 if (!known_alignment_for_access_p (dr))
1241 tree type = TREE_TYPE (DR_REF (dr));
1242 bool is_packed = not_size_aligned (DR_REF (dr));
1243 if (dump_enabled_p ())
1244 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1245 "Unknown misalignment, is_packed = %d", is_packed);
1246 if (targetm.vectorize.vector_alignment_reachable (type, is_packed))
1247 return true;
1248 else
1249 return false;
1252 return true;
1256 /* Calculate the cost of the memory access represented by DR. */
1258 static void
1259 vect_get_data_access_cost (struct data_reference *dr,
1260 unsigned int *inside_cost,
1261 unsigned int *outside_cost,
1262 stmt_vector_for_cost *body_cost_vec)
1264 gimple stmt = DR_STMT (dr);
1265 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1266 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1267 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1268 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1269 int ncopies = vf / nunits;
1271 if (DR_IS_READ (dr))
1272 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1273 NULL, body_cost_vec, false);
1274 else
1275 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1277 if (dump_enabled_p ())
1278 dump_printf_loc (MSG_NOTE, vect_location,
1279 "vect_get_data_access_cost: inside_cost = %d, "
1280 "outside_cost = %d.", *inside_cost, *outside_cost);
1284 static hashval_t
1285 vect_peeling_hash (const void *elem)
1287 const struct _vect_peel_info *peel_info;
1289 peel_info = (const struct _vect_peel_info *) elem;
1290 return (hashval_t) peel_info->npeel;
1294 static int
1295 vect_peeling_hash_eq (const void *elem1, const void *elem2)
1297 const struct _vect_peel_info *a, *b;
1299 a = (const struct _vect_peel_info *) elem1;
1300 b = (const struct _vect_peel_info *) elem2;
1301 return (a->npeel == b->npeel);
1305 /* Insert DR into peeling hash table with NPEEL as key. */
1307 static void
1308 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1309 int npeel)
1311 struct _vect_peel_info elem, *slot;
1312 void **new_slot;
1313 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1315 elem.npeel = npeel;
1316 slot = (vect_peel_info) htab_find (LOOP_VINFO_PEELING_HTAB (loop_vinfo),
1317 &elem);
1318 if (slot)
1319 slot->count++;
1320 else
1322 slot = XNEW (struct _vect_peel_info);
1323 slot->npeel = npeel;
1324 slot->dr = dr;
1325 slot->count = 1;
1326 new_slot = htab_find_slot (LOOP_VINFO_PEELING_HTAB (loop_vinfo), slot,
1327 INSERT);
1328 *new_slot = slot;
1331 if (!supportable_dr_alignment && !flag_vect_cost_model)
1332 slot->count += VECT_MAX_COST;
1336 /* Traverse peeling hash table to find peeling option that aligns maximum
1337 number of data accesses. */
1339 static int
1340 vect_peeling_hash_get_most_frequent (void **slot, void *data)
1342 vect_peel_info elem = (vect_peel_info) *slot;
1343 vect_peel_extended_info max = (vect_peel_extended_info) data;
1345 if (elem->count > max->peel_info.count
1346 || (elem->count == max->peel_info.count
1347 && max->peel_info.npeel > elem->npeel))
1349 max->peel_info.npeel = elem->npeel;
1350 max->peel_info.count = elem->count;
1351 max->peel_info.dr = elem->dr;
1354 return 1;
1358 /* Traverse peeling hash table and calculate cost for each peeling option.
1359 Find the one with the lowest cost. */
1361 static int
1362 vect_peeling_hash_get_lowest_cost (void **slot, void *data)
1364 vect_peel_info elem = (vect_peel_info) *slot;
1365 vect_peel_extended_info min = (vect_peel_extended_info) data;
1366 int save_misalignment, dummy;
1367 unsigned int inside_cost = 0, outside_cost = 0, i;
1368 gimple stmt = DR_STMT (elem->dr);
1369 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1370 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1371 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1372 struct data_reference *dr;
1373 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1374 int single_iter_cost;
1376 prologue_cost_vec.create (2);
1377 body_cost_vec.create (2);
1378 epilogue_cost_vec.create (2);
1380 FOR_EACH_VEC_ELT (datarefs, i, dr)
1382 stmt = DR_STMT (dr);
1383 stmt_info = vinfo_for_stmt (stmt);
1384 /* For interleaving, only the alignment of the first access
1385 matters. */
1386 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1387 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1388 continue;
1390 save_misalignment = DR_MISALIGNMENT (dr);
1391 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1392 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1393 &body_cost_vec);
1394 SET_DR_MISALIGNMENT (dr, save_misalignment);
1397 single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
1398 outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel,
1399 &dummy, single_iter_cost,
1400 &prologue_cost_vec,
1401 &epilogue_cost_vec);
1403 /* Prologue and epilogue costs are added to the target model later.
1404 These costs depend only on the scalar iteration cost, the
1405 number of peeling iterations finally chosen, and the number of
1406 misaligned statements. So discard the information found here. */
1407 prologue_cost_vec.release ();
1408 epilogue_cost_vec.release ();
1410 if (inside_cost < min->inside_cost
1411 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1413 min->inside_cost = inside_cost;
1414 min->outside_cost = outside_cost;
1415 min->body_cost_vec.release ();
1416 min->body_cost_vec = body_cost_vec;
1417 min->peel_info.dr = elem->dr;
1418 min->peel_info.npeel = elem->npeel;
1420 else
1421 body_cost_vec.release ();
1423 return 1;
1427 /* Choose best peeling option by traversing peeling hash table and either
1428 choosing an option with the lowest cost (if cost model is enabled) or the
1429 option that aligns as many accesses as possible. */
1431 static struct data_reference *
1432 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1433 unsigned int *npeel,
1434 stmt_vector_for_cost *body_cost_vec)
1436 struct _vect_peel_extended_info res;
1438 res.peel_info.dr = NULL;
1439 res.body_cost_vec = stmt_vector_for_cost();
1441 if (flag_vect_cost_model)
1443 res.inside_cost = INT_MAX;
1444 res.outside_cost = INT_MAX;
1445 htab_traverse (LOOP_VINFO_PEELING_HTAB (loop_vinfo),
1446 vect_peeling_hash_get_lowest_cost, &res);
1448 else
1450 res.peel_info.count = 0;
1451 htab_traverse (LOOP_VINFO_PEELING_HTAB (loop_vinfo),
1452 vect_peeling_hash_get_most_frequent, &res);
1455 *npeel = res.peel_info.npeel;
1456 *body_cost_vec = res.body_cost_vec;
1457 return res.peel_info.dr;
1461 /* Function vect_enhance_data_refs_alignment
1463 This pass will use loop versioning and loop peeling in order to enhance
1464 the alignment of data references in the loop.
1466 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1467 original loop is to be vectorized. Any other loops that are created by
1468 the transformations performed in this pass - are not supposed to be
1469 vectorized. This restriction will be relaxed.
1471 This pass will require a cost model to guide it whether to apply peeling
1472 or versioning or a combination of the two. For example, the scheme that
1473 intel uses when given a loop with several memory accesses, is as follows:
1474 choose one memory access ('p') whose alignment you want to force by doing
1475 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1476 other accesses are not necessarily aligned, or (2) use loop versioning to
1477 generate one loop in which all accesses are aligned, and another loop in
1478 which only 'p' is necessarily aligned.
1480 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1481 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1482 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1484 Devising a cost model is the most critical aspect of this work. It will
1485 guide us on which access to peel for, whether to use loop versioning, how
1486 many versions to create, etc. The cost model will probably consist of
1487 generic considerations as well as target specific considerations (on
1488 powerpc for example, misaligned stores are more painful than misaligned
1489 loads).
1491 Here are the general steps involved in alignment enhancements:
1493 -- original loop, before alignment analysis:
1494 for (i=0; i<N; i++){
1495 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1496 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1499 -- After vect_compute_data_refs_alignment:
1500 for (i=0; i<N; i++){
1501 x = q[i]; # DR_MISALIGNMENT(q) = 3
1502 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1505 -- Possibility 1: we do loop versioning:
1506 if (p is aligned) {
1507 for (i=0; i<N; i++){ # loop 1A
1508 x = q[i]; # DR_MISALIGNMENT(q) = 3
1509 p[i] = y; # DR_MISALIGNMENT(p) = 0
1512 else {
1513 for (i=0; i<N; i++){ # loop 1B
1514 x = q[i]; # DR_MISALIGNMENT(q) = 3
1515 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1519 -- Possibility 2: we do loop peeling:
1520 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1521 x = q[i];
1522 p[i] = y;
1524 for (i = 3; i < N; i++){ # loop 2A
1525 x = q[i]; # DR_MISALIGNMENT(q) = 0
1526 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1529 -- Possibility 3: combination of loop peeling and versioning:
1530 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1531 x = q[i];
1532 p[i] = y;
1534 if (p is aligned) {
1535 for (i = 3; i<N; i++){ # loop 3A
1536 x = q[i]; # DR_MISALIGNMENT(q) = 0
1537 p[i] = y; # DR_MISALIGNMENT(p) = 0
1540 else {
1541 for (i = 3; i<N; i++){ # loop 3B
1542 x = q[i]; # DR_MISALIGNMENT(q) = 0
1543 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1547 These loops are later passed to loop_transform to be vectorized. The
1548 vectorizer will use the alignment information to guide the transformation
1549 (whether to generate regular loads/stores, or with special handling for
1550 misalignment). */
1552 bool
1553 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1555 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1556 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1557 enum dr_alignment_support supportable_dr_alignment;
1558 struct data_reference *dr0 = NULL, *first_store = NULL;
1559 struct data_reference *dr;
1560 unsigned int i, j;
1561 bool do_peeling = false;
1562 bool do_versioning = false;
1563 bool stat;
1564 gimple stmt;
1565 stmt_vec_info stmt_info;
1566 int vect_versioning_for_alias_required;
1567 unsigned int npeel = 0;
1568 bool all_misalignments_unknown = true;
1569 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1570 unsigned possible_npeel_number = 1;
1571 tree vectype;
1572 unsigned int nelements, mis, same_align_drs_max = 0;
1573 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost();
1575 if (dump_enabled_p ())
1576 dump_printf_loc (MSG_NOTE, vect_location,
1577 "=== vect_enhance_data_refs_alignment ===");
1579 /* While cost model enhancements are expected in the future, the high level
1580 view of the code at this time is as follows:
1582 A) If there is a misaligned access then see if peeling to align
1583 this access can make all data references satisfy
1584 vect_supportable_dr_alignment. If so, update data structures
1585 as needed and return true.
1587 B) If peeling wasn't possible and there is a data reference with an
1588 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1589 then see if loop versioning checks can be used to make all data
1590 references satisfy vect_supportable_dr_alignment. If so, update
1591 data structures as needed and return true.
1593 C) If neither peeling nor versioning were successful then return false if
1594 any data reference does not satisfy vect_supportable_dr_alignment.
1596 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1598 Note, Possibility 3 above (which is peeling and versioning together) is not
1599 being done at this time. */
1601 /* (1) Peeling to force alignment. */
1603 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1604 Considerations:
1605 + How many accesses will become aligned due to the peeling
1606 - How many accesses will become unaligned due to the peeling,
1607 and the cost of misaligned accesses.
1608 - The cost of peeling (the extra runtime checks, the increase
1609 in code size). */
1611 FOR_EACH_VEC_ELT (datarefs, i, dr)
1613 stmt = DR_STMT (dr);
1614 stmt_info = vinfo_for_stmt (stmt);
1616 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1617 continue;
1619 /* For interleaving, only the alignment of the first access
1620 matters. */
1621 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1622 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1623 continue;
1625 /* For invariant accesses there is nothing to enhance. */
1626 if (integer_zerop (DR_STEP (dr)))
1627 continue;
1629 /* Strided loads perform only component accesses, alignment is
1630 irrelevant for them. */
1631 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1632 continue;
1634 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1635 do_peeling = vector_alignment_reachable_p (dr);
1636 if (do_peeling)
1638 if (known_alignment_for_access_p (dr))
1640 unsigned int npeel_tmp;
1641 bool negative = tree_int_cst_compare (DR_STEP (dr),
1642 size_zero_node) < 0;
1644 /* Save info about DR in the hash table. */
1645 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
1646 LOOP_VINFO_PEELING_HTAB (loop_vinfo) =
1647 htab_create (1, vect_peeling_hash,
1648 vect_peeling_hash_eq, free);
1650 vectype = STMT_VINFO_VECTYPE (stmt_info);
1651 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1652 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1653 TREE_TYPE (DR_REF (dr))));
1654 npeel_tmp = (negative
1655 ? (mis - nelements) : (nelements - mis))
1656 & (nelements - 1);
1658 /* For multiple types, it is possible that the bigger type access
1659 will have more than one peeling option. E.g., a loop with two
1660 types: one of size (vector size / 4), and the other one of
1661 size (vector size / 8). Vectorization factor will be 8. If both
1662 access are misaligned by 3, the first one needs one scalar
1663 iteration to be aligned, and the second one needs 5. But the
1664 first one will be aligned also by peeling 5 scalar
1665 iterations, and in that case both accesses will be aligned.
1666 Hence, except for the immediate peeling amount, we also want
1667 to try to add full vector size, while we don't exceed
1668 vectorization factor.
1669 We do this automatically for the cost model, since we calculate the cost
1670 for every peeling option. */
1671 if (!flag_vect_cost_model)
1672 possible_npeel_number = vf / nelements;
1674 /* Handle the aligned case. We may decide to align some other
1675 access, making DR unaligned. */
1676 if (DR_MISALIGNMENT (dr) == 0)
1678 npeel_tmp = 0;
1679 if (!flag_vect_cost_model)
1680 possible_npeel_number++;
1683 for (j = 0; j < possible_npeel_number; j++)
1685 gcc_assert (npeel_tmp <= vf);
1686 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1687 npeel_tmp += nelements;
1690 all_misalignments_unknown = false;
1691 /* Data-ref that was chosen for the case that all the
1692 misalignments are unknown is not relevant anymore, since we
1693 have a data-ref with known alignment. */
1694 dr0 = NULL;
1696 else
1698 /* If we don't know all the misalignment values, we prefer
1699 peeling for the data-ref that has the maximum number of data-refs
1700 with the same alignment, unless the target prefers to align
1701 stores over loads. */
1702 if (all_misalignments_unknown)
1704 if (same_align_drs_max
1705 < STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ()
1706 || !dr0)
1708 same_align_drs_max
1709 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1710 dr0 = dr;
1713 if (!first_store && DR_IS_WRITE (dr))
1714 first_store = dr;
1717 /* If there are both known and unknown misaligned accesses in the
1718 loop, we choose peeling amount according to the known
1719 accesses. */
1722 if (!supportable_dr_alignment)
1724 dr0 = dr;
1725 if (!first_store && DR_IS_WRITE (dr))
1726 first_store = dr;
1730 else
1732 if (!aligned_access_p (dr))
1734 if (dump_enabled_p ())
1735 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1736 "vector alignment may not be reachable");
1737 break;
1742 vect_versioning_for_alias_required
1743 = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
1745 /* Temporarily, if versioning for alias is required, we disable peeling
1746 until we support peeling and versioning. Often peeling for alignment
1747 will require peeling for loop-bound, which in turn requires that we
1748 know how to adjust the loop ivs after the loop. */
1749 if (vect_versioning_for_alias_required
1750 || !vect_can_advance_ivs_p (loop_vinfo)
1751 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1752 do_peeling = false;
1754 if (do_peeling && all_misalignments_unknown
1755 && vect_supportable_dr_alignment (dr0, false))
1758 /* Check if the target requires to prefer stores over loads, i.e., if
1759 misaligned stores are more expensive than misaligned loads (taking
1760 drs with same alignment into account). */
1761 if (first_store && DR_IS_READ (dr0))
1763 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1764 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1765 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1766 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1767 stmt_vector_for_cost dummy;
1768 dummy.create (2);
1770 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1771 &dummy);
1772 vect_get_data_access_cost (first_store, &store_inside_cost,
1773 &store_outside_cost, &dummy);
1775 dummy.release ();
1777 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1778 aligning the load DR0). */
1779 load_inside_penalty = store_inside_cost;
1780 load_outside_penalty = store_outside_cost;
1781 for (i = 0;
1782 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1783 DR_STMT (first_store))).iterate (i, &dr);
1784 i++)
1785 if (DR_IS_READ (dr))
1787 load_inside_penalty += load_inside_cost;
1788 load_outside_penalty += load_outside_cost;
1790 else
1792 load_inside_penalty += store_inside_cost;
1793 load_outside_penalty += store_outside_cost;
1796 /* Calculate the penalty for leaving DR0 unaligned (by
1797 aligning the FIRST_STORE). */
1798 store_inside_penalty = load_inside_cost;
1799 store_outside_penalty = load_outside_cost;
1800 for (i = 0;
1801 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1802 DR_STMT (dr0))).iterate (i, &dr);
1803 i++)
1804 if (DR_IS_READ (dr))
1806 store_inside_penalty += load_inside_cost;
1807 store_outside_penalty += load_outside_cost;
1809 else
1811 store_inside_penalty += store_inside_cost;
1812 store_outside_penalty += store_outside_cost;
1815 if (load_inside_penalty > store_inside_penalty
1816 || (load_inside_penalty == store_inside_penalty
1817 && load_outside_penalty > store_outside_penalty))
1818 dr0 = first_store;
1821 /* In case there are only loads with different unknown misalignments, use
1822 peeling only if it may help to align other accesses in the loop. */
1823 if (!first_store
1824 && !STMT_VINFO_SAME_ALIGN_REFS (
1825 vinfo_for_stmt (DR_STMT (dr0))).length ()
1826 && vect_supportable_dr_alignment (dr0, false)
1827 != dr_unaligned_supported)
1828 do_peeling = false;
1831 if (do_peeling && !dr0)
1833 /* Peeling is possible, but no data access strictly requires alignment
1834 to be supported. So we try to choose the best possible peeling. */
1836 /* We should get here only if there are drs with known misalignment. */
1837 gcc_assert (!all_misalignments_unknown);
1839 /* Choose the best peeling from the hash table. */
1840 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1841 &body_cost_vec);
1842 if (!dr0 || !npeel)
1843 do_peeling = false;
1846 if (do_peeling)
1848 stmt = DR_STMT (dr0);
1849 stmt_info = vinfo_for_stmt (stmt);
1850 vectype = STMT_VINFO_VECTYPE (stmt_info);
1851 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1853 if (known_alignment_for_access_p (dr0))
1855 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1856 size_zero_node) < 0;
1857 if (!npeel)
1859 /* Since it's known at compile time, compute the number of
1860 iterations in the peeled loop (the peeling factor) for use in
1861 updating DR_MISALIGNMENT values. The peeling factor is the
1862 vectorization factor minus the misalignment as an element
1863 count. */
1864 mis = DR_MISALIGNMENT (dr0);
1865 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1866 npeel = ((negative ? mis - nelements : nelements - mis)
1867 & (nelements - 1));
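/* For example, with V4SI vectors (nelements == 4) and 4-byte elements, a
   data-ref whose known misalignment is 8 bytes gives mis == 2 elements and,
   for a positive step, npeel == (4 - 2) & 3 == 2 scalar iterations.  */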
1870 /* For interleaved data access every iteration accesses all the
1871 members of the group, therefore we divide the number of iterations
1872 by the group size. */
1873 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1874 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1875 npeel /= GROUP_SIZE (stmt_info);
1877 if (dump_enabled_p ())
1878 dump_printf_loc (MSG_NOTE, vect_location,
1879 "Try peeling by %d", npeel);
1882 /* Ensure that all data refs can be vectorized after the peel. */
1883 FOR_EACH_VEC_ELT (datarefs, i, dr)
1885 int save_misalignment;
1887 if (dr == dr0)
1888 continue;
1890 stmt = DR_STMT (dr);
1891 stmt_info = vinfo_for_stmt (stmt);
1892 /* For interleaving, only the alignment of the first access
1893 matters. */
1894 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1895 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1896 continue;
1898 /* Strided loads perform only component accesses; alignment is
1899 irrelevant for them. */
1900 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1901 continue;
1903 save_misalignment = DR_MISALIGNMENT (dr);
1904 vect_update_misalignment_for_peel (dr, dr0, npeel);
1905 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1906 SET_DR_MISALIGNMENT (dr, save_misalignment);
1908 if (!supportable_dr_alignment)
1910 do_peeling = false;
1911 break;
1915 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1917 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1918 if (!stat)
1919 do_peeling = false;
1920 else
1922 body_cost_vec.release ();
1923 return stat;
1927 if (do_peeling)
1929 stmt_info_for_cost *si;
1930 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
1932 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1933 If the misalignment of DR_i is identical to that of dr0 then set
1934 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1935 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1936 by the peeling factor times the element size of DR_i (MOD the
1937 vectorization factor times the size). Otherwise, the
1938 misalignment of DR_i must be set to unknown. */
1939 FOR_EACH_VEC_ELT (datarefs, i, dr)
1940 if (dr != dr0)
1941 vect_update_misalignment_for_peel (dr, dr0, npeel);
1943 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1944 if (npeel)
1945 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1946 else
1947 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = DR_MISALIGNMENT (dr0);
1948 SET_DR_MISALIGNMENT (dr0, 0);
1949 if (dump_enabled_p ())
1951 dump_printf_loc (MSG_NOTE, vect_location,
1952 "Alignment of access forced using peeling.");
1953 dump_printf_loc (MSG_NOTE, vect_location,
1954 "Peeling for alignment will be applied.");
1956 /* We've delayed passing the inside-loop peeling costs to the
1957 target cost model until we were sure peeling would happen.
1958 Do so now. */
1959 if (body_cost_vec.exists ())
1961 FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1963 struct _stmt_vec_info *stmt_info
1964 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1965 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1966 si->misalign, vect_body);
1968 body_cost_vec.release ();
1971 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1972 gcc_assert (stat);
1973 return stat;
1977 body_cost_vec.release ();
1979 /* (2) Versioning to force alignment. */
1981 /* Try versioning if:
1982 1) flag_tree_vect_loop_version is TRUE
1983 2) optimize loop for speed
1984 3) there is at least one unsupported misaligned data ref with an unknown
1985 misalignment, and
1986 4) all misaligned data refs with a known misalignment are supported, and
1987 5) the number of runtime alignment checks is within reason. */
1989 do_versioning =
1990 flag_tree_vect_loop_version
1991 && optimize_loop_nest_for_speed_p (loop)
1992 && (!loop->inner); /* FORNOW */
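/* Conditions 3-5 are checked per data reference in the loop below, which may
   still reset do_versioning to false.  */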
1994 if (do_versioning)
1996 FOR_EACH_VEC_ELT (datarefs, i, dr)
1998 stmt = DR_STMT (dr);
1999 stmt_info = vinfo_for_stmt (stmt);
2001 /* For interleaving, only the alignment of the first access
2002 matters. */
2003 if (aligned_access_p (dr)
2004 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2005 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2006 continue;
2008 /* Strided loads perform only component accesses; alignment is
2009 irrelevant for them. */
2010 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
2011 continue;
2013 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2015 if (!supportable_dr_alignment)
2017 gimple stmt;
2018 int mask;
2019 tree vectype;
2021 if (known_alignment_for_access_p (dr)
2022 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2023 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2025 do_versioning = false;
2026 break;
2029 stmt = DR_STMT (dr);
2030 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2031 gcc_assert (vectype);
2033 /* The rightmost bits of an aligned address must be zeros.
2034 Construct the mask needed for this test. For example,
2035 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2036 mask must be 15 = 0xf. */
2037 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
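/* At run time a pointer P is then suitably aligned iff (P & mask) == 0; the
   actual guard for the may-misalign statements collected below is emitted
   when the loop is versioned (see vect_create_cond_for_align_checks).  */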
2039 /* FORNOW: use the same mask to test all potentially unaligned
2040 references in the loop. The vectorizer currently supports
2041 a single vector size; see the reference to
2042 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2043 vectorization factor is computed. */
2044 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2045 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2046 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2047 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2048 DR_STMT (dr));
2052 /* Versioning requires at least one misaligned data reference. */
2053 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2054 do_versioning = false;
2055 else if (!do_versioning)
2056 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2059 if (do_versioning)
2061 vec<gimple> may_misalign_stmts
2062 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2063 gimple stmt;
2065 /* It can now be assumed that the data references in the statements
2066 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2067 of the loop being vectorized. */
2068 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2070 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2071 dr = STMT_VINFO_DATA_REF (stmt_info);
2072 SET_DR_MISALIGNMENT (dr, 0);
2073 if (dump_enabled_p ())
2074 dump_printf_loc (MSG_NOTE, vect_location,
2075 "Alignment of access forced using versioning.");
2078 if (dump_enabled_p ())
2079 dump_printf_loc (MSG_NOTE, vect_location,
2080 "Versioning for alignment will be applied.");
2082 /* Peeling and versioning can't be done together at this time. */
2083 gcc_assert (! (do_peeling && do_versioning));
2085 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
2086 gcc_assert (stat);
2087 return stat;
2090 /* This point is reached if neither peeling nor versioning is being done. */
2091 gcc_assert (! (do_peeling || do_versioning));
2093 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
2094 return stat;
2098 /* Function vect_find_same_alignment_drs.
2100 Update group and alignment relations according to the chosen
2101 vectorization factor. */
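/* For example, with a vectorization factor of 4 and equal-sized elements,
   two accesses whose dependence distance is 0 or a multiple of 4 iterations
   keep the same misalignment relative to the vector size, so each is
   recorded in the other's STMT_VINFO_SAME_ALIGN_REFS below.  */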
2103 static void
2104 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
2105 loop_vec_info loop_vinfo)
2107 unsigned int i;
2108 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2109 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2110 struct data_reference *dra = DDR_A (ddr);
2111 struct data_reference *drb = DDR_B (ddr);
2112 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2113 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2114 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
2115 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
2116 lambda_vector dist_v;
2117 unsigned int loop_depth;
2119 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2120 return;
2122 if (dra == drb)
2123 return;
2125 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
2126 return;
2128 /* Loop-based vectorization and known data dependence. */
2129 if (DDR_NUM_DIST_VECTS (ddr) == 0)
2130 return;
2132 /* Data-dependence analysis reports a distance vector of zero
2133 for data-references that overlap only in the first iteration
2134 but have steps of different sign (see PR45764).
2135 So as a sanity check require equal DR_STEP. */
2136 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2137 return;
2139 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
2140 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
2142 int dist = dist_v[loop_depth];
2144 if (dump_enabled_p ())
2145 dump_printf_loc (MSG_NOTE, vect_location,
2146 "dependence distance = %d.", dist);
2148 /* Same loop iteration. */
2149 if (dist == 0
2150 || (dist % vectorization_factor == 0 && dra_size == drb_size))
2152 /* Two references with distance zero have the same alignment. */
2153 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2154 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2155 if (dump_enabled_p ())
2157 dump_printf_loc (MSG_NOTE, vect_location,
2158 "accesses have the same alignment.");
2159 dump_printf (MSG_NOTE,
2160 "dependence distance modulo vf == 0 between ");
2161 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2162 dump_printf (MSG_NOTE, " and ");
2163 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2170 /* Function vect_analyze_data_refs_alignment
2172 Analyze the alignment of the data-references in the loop.
2173 Return FALSE if a data reference is found that cannot be vectorized. */
2175 bool
2176 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
2177 bb_vec_info bb_vinfo)
2179 if (dump_enabled_p ())
2180 dump_printf_loc (MSG_NOTE, vect_location,
2181 "=== vect_analyze_data_refs_alignment ===");
2183 /* Mark groups of data references with same alignment using
2184 data dependence information. */
2185 if (loop_vinfo)
2187 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
2188 struct data_dependence_relation *ddr;
2189 unsigned int i;
2191 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2192 vect_find_same_alignment_drs (ddr, loop_vinfo);
2195 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
2197 if (dump_enabled_p ())
2198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2199 "not vectorized: can't calculate alignment "
2200 "for data ref.");
2201 return false;
2204 return true;
2208 /* Analyze groups of accesses: check that DR belongs to a group of
2209 accesses of legal size, step, etc. Detect gaps, single element
2210 interleaving, and other special cases. Set grouped access info.
2211 Collect groups of strided stores for further use in SLP analysis. */
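/* For example, the accesses a[2*i] and a[2*i+1], each with step
   2 * sizeof (a[0]), form one interleaving group of size 2 rooted at the
   first access; a member of the group that is never accessed shows up as a
   gap.  */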
2213 static bool
2214 vect_analyze_group_access (struct data_reference *dr)
2216 tree step = DR_STEP (dr);
2217 tree scalar_type = TREE_TYPE (DR_REF (dr));
2218 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2219 gimple stmt = DR_STMT (dr);
2220 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2221 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2222 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2223 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2224 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2225 bool slp_impossible = false;
2226 struct loop *loop = NULL;
2228 if (loop_vinfo)
2229 loop = LOOP_VINFO_LOOP (loop_vinfo);
2231 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2232 size of the interleaving group (including gaps). */
2233 groupsize = dr_step / type_size;
2235 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2236 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2238 /* Check whether this DR is part of an interleaving group and is the only
2239 element of the group that is accessed in the loop. */
2241 /* Gaps are supported only for loads. STEP must be a multiple of the type
2242 size. The size of the group must be a power of 2. */
2243 if (DR_IS_READ (dr)
2244 && (dr_step % type_size) == 0
2245 && groupsize > 0
2246 && exact_log2 (groupsize) != -1)
2248 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2249 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2250 if (dump_enabled_p ())
2252 dump_printf_loc (MSG_NOTE, vect_location,
2253 "Detected single element interleaving ");
2254 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2255 dump_printf (MSG_NOTE, " step ");
2256 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2259 if (loop_vinfo)
2261 if (dump_enabled_p ())
2262 dump_printf_loc (MSG_NOTE, vect_location,
2263 "Data access with gaps requires scalar "
2264 "epilogue loop");
2265 if (loop->inner)
2267 if (dump_enabled_p ())
2268 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2269 "Peeling for outer loop is not"
2270 " supported");
2271 return false;
2274 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2277 return true;
2280 if (dump_enabled_p ())
2282 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2283 "not consecutive access ");
2284 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2287 if (bb_vinfo)
2289 /* Mark the statement as unvectorizable. */
2290 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2291 return true;
2294 return false;
2297 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2299 /* First stmt in the interleaving chain. Check the chain. */
2300 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2301 struct data_reference *data_ref = dr;
2302 unsigned int count = 1;
2303 tree next_step;
2304 tree prev_init = DR_INIT (data_ref);
2305 gimple prev = stmt;
2306 HOST_WIDE_INT diff, count_in_bytes, gaps = 0;
2308 while (next)
2310 /* Skip same data-refs. In case two or more stmts share a
2311 data-ref (supported only for loads), we vectorize only the first
2312 stmt, and the rest get their vectorized loads from the first
2313 one. */
2314 if (!tree_int_cst_compare (DR_INIT (data_ref),
2315 DR_INIT (STMT_VINFO_DATA_REF (
2316 vinfo_for_stmt (next)))))
2318 if (DR_IS_WRITE (data_ref))
2320 if (dump_enabled_p ())
2321 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2322 "Two store stmts share the same dr.");
2323 return false;
2326 /* Check that there are no load-store dependences for these loads,
2327 to prevent a load-store-load sequence to the same location. */
2328 if (GROUP_READ_WRITE_DEPENDENCE (vinfo_for_stmt (next))
2329 || GROUP_READ_WRITE_DEPENDENCE (vinfo_for_stmt (prev)))
2331 if (dump_enabled_p ())
2332 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2333 "READ_WRITE dependence in interleaving.");
2334 return false;
2337 /* For loads, use the same data-ref load. */
2338 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2340 prev = next;
2341 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2342 continue;
2345 prev = next;
2347 /* Check that all the accesses have the same STEP. */
2348 next_step = DR_STEP (STMT_VINFO_DATA_REF (vinfo_for_stmt (next)));
2349 if (tree_int_cst_compare (step, next_step))
2351 if (dump_enabled_p ())
2352 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2353 "not consecutive access in interleaving");
2354 return false;
2357 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2358 /* Check that the distance between two accesses is equal to the type
2359 size. Otherwise, we have gaps. */
2360 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2361 - TREE_INT_CST_LOW (prev_init)) / type_size;
2362 if (diff != 1)
2364 /* FORNOW: SLP of accesses with gaps is not supported. */
2365 slp_impossible = true;
2366 if (DR_IS_WRITE (data_ref))
2368 if (dump_enabled_p ())
2369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2370 "interleaved store with gaps");
2371 return false;
2374 gaps += diff - 1;
2377 last_accessed_element += diff;
2379 /* Store the gap from the previous member of the group. If there is no
2380 gap in the access, GROUP_GAP is always 1. */
2381 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2383 prev_init = DR_INIT (data_ref);
2384 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2385 /* Count the number of data-refs in the chain. */
2386 count++;
2389 /* COUNT is the number of accesses found; we multiply it by the size of
2390 the type to get COUNT_IN_BYTES. */
2391 count_in_bytes = type_size * count;
2393 /* Check that the size of the interleaving (including gaps) is not
2394 greater than STEP. */
2395 if (dr_step && dr_step < count_in_bytes + gaps * type_size)
2397 if (dump_enabled_p ())
2399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2400 "interleaving size is greater than step for ");
2401 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dr));
2403 return false;
2406 /* Check that the size of the interleaving is equal to STEP for stores,
2407 i.e., that there are no gaps. */
2408 if (dr_step && dr_step != count_in_bytes)
2410 if (DR_IS_READ (dr))
2412 slp_impossible = true;
2413 /* There is a gap after the last load in the group. This gap is the
2414 difference between the groupsize and the number of elements.
2415 When there is no gap, this difference should be 0. */
2416 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
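/* E.g., a load group of size 4 from which only 3 members are actually read
   leaves GROUP_GAP == 1 on the first stmt of the group.  */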
2418 else
2420 if (dump_enabled_p ())
2421 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2422 "interleaved store with gaps");
2423 return false;
2427 /* Check that STEP is a multiple of type size. */
2428 if (dr_step && (dr_step % type_size) != 0)
2430 if (dump_enabled_p ())
2432 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2433 "step is not a multiple of type size: step ");
2434 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2435 dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2436 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2437 TYPE_SIZE_UNIT (scalar_type));
2439 return false;
2442 if (groupsize == 0)
2443 groupsize = count;
2445 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2446 if (dump_enabled_p ())
2447 dump_printf_loc (MSG_NOTE, vect_location,
2448 "Detected interleaving of size %d", (int)groupsize);
2450 /* SLP: create an SLP data structure for every interleaving group of
2451 stores for further analysis in vect_analyze_slp. */
2452 if (DR_IS_WRITE (dr) && !slp_impossible)
2454 if (loop_vinfo)
2455 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2456 if (bb_vinfo)
2457 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2460 /* There is a gap at the end of the group. */
2461 if (groupsize - last_accessed_element > 0 && loop_vinfo)
2463 if (dump_enabled_p ())
2464 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2465 "Data access with gaps requires scalar "
2466 "epilogue loop");
2467 if (loop->inner)
2469 if (dump_enabled_p ())
2470 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2471 "Peeling for outer loop is not supported");
2472 return false;
2475 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2479 return true;
2483 /* Analyze the access pattern of the data-reference DR.
2484 In case of non-consecutive accesses call vect_analyze_group_access() to
2485 analyze groups of accesses. */
2487 static bool
2488 vect_analyze_data_ref_access (struct data_reference *dr)
2490 tree step = DR_STEP (dr);
2491 tree scalar_type = TREE_TYPE (DR_REF (dr));
2492 gimple stmt = DR_STMT (dr);
2493 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2494 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2495 struct loop *loop = NULL;
2497 if (loop_vinfo)
2498 loop = LOOP_VINFO_LOOP (loop_vinfo);
2500 if (loop_vinfo && !step)
2502 if (dump_enabled_p ())
2503 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2504 "bad data-ref access in loop");
2505 return false;
2508 /* Allow invariant loads in non-nested loops. */
2509 if (loop_vinfo && integer_zerop (step))
2511 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2512 if (nested_in_vect_loop_p (loop, stmt))
2514 if (dump_enabled_p ())
2515 dump_printf_loc (MSG_NOTE, vect_location,
2516 "zero step in inner loop of nest");
2517 return false;
2519 return DR_IS_READ (dr);
2522 if (loop && nested_in_vect_loop_p (loop, stmt))
2524 /* Interleaved accesses are not yet supported within outer-loop
2525 vectorization for references in the inner-loop. */
2526 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2528 /* For the rest of the analysis we use the outer-loop step. */
2529 step = STMT_VINFO_DR_STEP (stmt_info);
2530 if (integer_zerop (step))
2532 if (dump_enabled_p ())
2533 dump_printf_loc (MSG_NOTE, vect_location,
2534 "zero step in outer loop.");
2535 if (DR_IS_READ (dr))
2536 return true;
2537 else
2538 return false;
2542 /* Consecutive? */
2543 if (TREE_CODE (step) == INTEGER_CST)
2545 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2546 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2547 || (dr_step < 0
2548 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2550 /* Mark that it is not interleaving. */
2551 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2552 return true;
2556 if (loop && nested_in_vect_loop_p (loop, stmt))
2558 if (dump_enabled_p ())
2559 dump_printf_loc (MSG_NOTE, vect_location,
2560 "grouped access in outer loop.");
2561 return false;
2564 /* Assume this is a DR handled by the non-constant strided load case. */
2565 if (TREE_CODE (step) != INTEGER_CST)
2566 return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2568 /* Not consecutive access - check if it's a part of interleaving group. */
2569 return vect_analyze_group_access (dr);
2573 /* Function vect_analyze_data_ref_accesses.
2575 Analyze the access pattern of all the data references in the loop.
2577 FORNOW: the only access pattern that is considered vectorizable is a
2578 simple step 1 (consecutive) access.
2580 FORNOW: handle only arrays and pointer accesses. */
2582 bool
2583 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2585 unsigned int i;
2586 vec<data_reference_p> datarefs;
2587 struct data_reference *dr;
2589 if (dump_enabled_p ())
2590 dump_printf_loc (MSG_NOTE, vect_location,
2591 "=== vect_analyze_data_ref_accesses ===");
2593 if (loop_vinfo)
2594 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2595 else
2596 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2598 FOR_EACH_VEC_ELT (datarefs, i, dr)
2599 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2600 && !vect_analyze_data_ref_access (dr))
2602 if (dump_enabled_p ())
2603 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2604 "not vectorized: complicated access pattern.");
2606 if (bb_vinfo)
2608 /* Mark the statement as not vectorizable. */
2609 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2610 continue;
2612 else
2613 return false;
2616 return true;
2619 /* Function vect_prune_runtime_alias_test_list.
2621 Prune a list of ddrs to be tested at run-time by versioning for alias.
2622 Return FALSE if the resulting list of ddrs is longer than allowed by
2623 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2625 bool
2626 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2628 vec<ddr_p> ddrs =
2629 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2630 unsigned i, j;
2632 if (dump_enabled_p ())
2633 dump_printf_loc (MSG_NOTE, vect_location,
2634 "=== vect_prune_runtime_alias_test_list ===");
2636 for (i = 0; i < ddrs.length (); )
2638 bool found;
2639 ddr_p ddr_i;
2641 ddr_i = ddrs[i];
2642 found = false;
2644 for (j = 0; j < i; j++)
2646 ddr_p ddr_j = ddrs[j];
2648 if (vect_vfa_range_equal (ddr_i, ddr_j))
2650 if (dump_enabled_p ())
2652 dump_printf_loc (MSG_NOTE, vect_location,
2653 "found equal ranges ");
2654 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr_i)));
2655 dump_printf (MSG_NOTE, ", ");
2656 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr_i)));
2657 dump_printf (MSG_NOTE, " and ");
2658 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr_j)));
2659 dump_printf (MSG_NOTE, ", ");
2660 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr_j)));
2662 found = true;
2663 break;
2667 if (found)
2669 ddrs.ordered_remove (i);
2670 continue;
2672 i++;
2675 if (ddrs.length () >
2676 (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2678 if (dump_enabled_p ())
2680 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2681 "disable versioning for alias - max number of "
2682 "generated checks exceeded.");
2685 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0);
2687 return false;
2690 return true;
2693 /* Check whether a non-affine read in stmt is suitable for gather load
2694 and if so, return a builtin decl for that operation. */
2696 tree
2697 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2698 tree *offp, int *scalep)
2700 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2701 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2702 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2703 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2704 tree offtype = NULL_TREE;
2705 tree decl, base, off;
2706 enum machine_mode pmode;
2707 int punsignedp, pvolatilep;
2709 /* The gather builtins need address of the form
2710 loop_invariant + vector * {1, 2, 4, 8}
2712 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2713 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
2714 of loop invariants/SSA_NAMEs defined in the loop, with casts,
2715 multiplications and additions in it. To get a vector, we need
2716 a single SSA_NAME that will be defined in the loop and will
2717 contain everything that is not loop invariant and that can be
2718 vectorized. The following code attempts to find such a preexisting
2719 SSA_NAME OFF and put the loop invariants into a tree BASE
2720 that can be gimplified before the loop. */
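/* For example, for a load data[idx], where DATA is a loop-invariant pointer
   to double and IDX is an int defined in the loop, the decomposition below
   is expected to yield BASE == data, OFF == idx (with OFFTYPE int) and
   SCALE == 8.  */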
2721 base = get_inner_reference (DR_REF (dr), &pbitsize, &pbitpos, &off,
2722 &pmode, &punsignedp, &pvolatilep, false);
2723 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
2725 if (TREE_CODE (base) == MEM_REF)
2727 if (!integer_zerop (TREE_OPERAND (base, 1)))
2729 if (off == NULL_TREE)
2731 double_int moff = mem_ref_offset (base);
2732 off = double_int_to_tree (sizetype, moff);
2734 else
2735 off = size_binop (PLUS_EXPR, off,
2736 fold_convert (sizetype, TREE_OPERAND (base, 1)));
2738 base = TREE_OPERAND (base, 0);
2740 else
2741 base = build_fold_addr_expr (base);
2743 if (off == NULL_TREE)
2744 off = size_zero_node;
2746 /* If BASE is not loop invariant and OFF is 0, then start with just
2747 the constant offset as the loop-invariant BASE and continue with the
2748 old base as OFF; if OFF is nonzero as well, give up.
2749 We could handle that case by gimplifying the addition of BASE + OFF
2750 into some SSA_NAME and using that as OFF, but for now punt. */
2751 if (!expr_invariant_in_loop_p (loop, base))
2753 if (!integer_zerop (off))
2754 return NULL_TREE;
2755 off = base;
2756 base = size_int (pbitpos / BITS_PER_UNIT);
2758 /* Otherwise put base + constant offset into the loop invariant BASE
2759 and continue with OFF. */
2760 else
2762 base = fold_convert (sizetype, base);
2763 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
2766 /* OFF at this point may be either a SSA_NAME or some tree expression
2767 from get_inner_reference. Try to peel off loop invariants from it
2768 into BASE as long as possible. */
2769 STRIP_NOPS (off);
2770 while (offtype == NULL_TREE)
2772 enum tree_code code;
2773 tree op0, op1, add = NULL_TREE;
2775 if (TREE_CODE (off) == SSA_NAME)
2777 gimple def_stmt = SSA_NAME_DEF_STMT (off);
2779 if (expr_invariant_in_loop_p (loop, off))
2780 return NULL_TREE;
2782 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
2783 break;
2785 op0 = gimple_assign_rhs1 (def_stmt);
2786 code = gimple_assign_rhs_code (def_stmt);
2787 op1 = gimple_assign_rhs2 (def_stmt);
2789 else
2791 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
2792 return NULL_TREE;
2793 code = TREE_CODE (off);
2794 extract_ops_from_tree (off, &code, &op0, &op1);
2796 switch (code)
2798 case POINTER_PLUS_EXPR:
2799 case PLUS_EXPR:
2800 if (expr_invariant_in_loop_p (loop, op0))
2802 add = op0;
2803 off = op1;
2804 do_add:
2805 add = fold_convert (sizetype, add);
2806 if (scale != 1)
2807 add = size_binop (MULT_EXPR, add, size_int (scale));
2808 base = size_binop (PLUS_EXPR, base, add);
2809 continue;
2811 if (expr_invariant_in_loop_p (loop, op1))
2813 add = op1;
2814 off = op0;
2815 goto do_add;
2817 break;
2818 case MINUS_EXPR:
2819 if (expr_invariant_in_loop_p (loop, op1))
2821 add = fold_convert (sizetype, op1);
2822 add = size_binop (MINUS_EXPR, size_zero_node, add);
2823 off = op0;
2824 goto do_add;
2826 break;
2827 case MULT_EXPR:
2828 if (scale == 1 && host_integerp (op1, 0))
2830 scale = tree_low_cst (op1, 0);
2831 off = op0;
2832 continue;
2834 break;
2835 case SSA_NAME:
2836 off = op0;
2837 continue;
2838 CASE_CONVERT:
2839 if (!POINTER_TYPE_P (TREE_TYPE (op0))
2840 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
2841 break;
2842 if (TYPE_PRECISION (TREE_TYPE (op0))
2843 == TYPE_PRECISION (TREE_TYPE (off)))
2845 off = op0;
2846 continue;
2848 if (TYPE_PRECISION (TREE_TYPE (op0))
2849 < TYPE_PRECISION (TREE_TYPE (off)))
2851 off = op0;
2852 offtype = TREE_TYPE (off);
2853 STRIP_NOPS (off);
2854 continue;
2856 break;
2857 default:
2858 break;
2860 break;
2863 /* If at the end OFF still isn't an SSA_NAME or isn't
2864 defined in the loop, punt. */
2865 if (TREE_CODE (off) != SSA_NAME
2866 || expr_invariant_in_loop_p (loop, off))
2867 return NULL_TREE;
2869 if (offtype == NULL_TREE)
2870 offtype = TREE_TYPE (off);
2872 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
2873 offtype, scale);
2874 if (decl == NULL_TREE)
2875 return NULL_TREE;
2877 if (basep)
2878 *basep = base;
2879 if (offp)
2880 *offp = off;
2881 if (scalep)
2882 *scalep = scale;
2883 return decl;
2886 /* Check whether a non-affine load in STMT (being in the loop referred to
2887 in LOOP_VINFO) is suitable for handling as a strided load. That is the
2888 case if its address is a simple induction variable whose base is loop
2889 invariant and whose step (the stride) is loop invariant as well, though
2890 possibly non-constant.
2892 This handles ARRAY_REFs (with variant index) and MEM_REFs (with variant
2893 base pointer) only. */
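/* For example, a read a[j] in which J is advanced by a loop-invariant but
   possibly non-constant stride on every iteration qualifies: A is the
   loop-invariant base and simple_iv confirms the affine evolution of J.  */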
2895 static bool
2896 vect_check_strided_load (gimple stmt, loop_vec_info loop_vinfo)
2898 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2899 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2900 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2901 tree base, off;
2902 affine_iv iv;
2904 if (!DR_IS_READ (dr))
2905 return false;
2907 base = DR_REF (dr);
2909 if (TREE_CODE (base) == ARRAY_REF)
2911 off = TREE_OPERAND (base, 1);
2912 base = TREE_OPERAND (base, 0);
2914 else if (TREE_CODE (base) == MEM_REF)
2916 off = TREE_OPERAND (base, 0);
2917 base = TREE_OPERAND (base, 1);
2919 else
2920 return false;
2922 if (TREE_CODE (off) != SSA_NAME)
2923 return false;
2925 if (!expr_invariant_in_loop_p (loop, base)
2926 || !simple_iv (loop, loop_containing_stmt (stmt), off, &iv, true))
2927 return false;
2929 return true;
2932 /* Function vect_analyze_data_refs.
2934 Find all the data references in the loop or basic block.
2936 The general structure of the analysis of data refs in the vectorizer is as
2937 follows:
2938 1- vect_analyze_data_refs(loop/bb): call
2939 compute_data_dependences_for_loop/bb to find and analyze all data-refs
2940 in the loop/bb and their dependences.
2941 2- vect_analyze_data_ref_dependences(): apply dependence testing using ddrs.
2942 3- vect_analyze_data_refs_alignment(): check that ref_stmt.alignment is ok.
2943 4- vect_analyze_data_ref_accesses(): check that ref_stmt.step is ok.
2947 bool
2948 vect_analyze_data_refs (loop_vec_info loop_vinfo,
2949 bb_vec_info bb_vinfo,
2950 int *min_vf)
2952 struct loop *loop = NULL;
2953 basic_block bb = NULL;
2954 unsigned int i;
2955 vec<data_reference_p> datarefs;
2956 struct data_reference *dr;
2957 tree scalar_type;
2958 bool res, stop_bb_analysis = false;
2960 if (dump_enabled_p ())
2961 dump_printf_loc (MSG_NOTE, vect_location,
2962 "=== vect_analyze_data_refs ===\n");
2964 if (loop_vinfo)
2966 loop = LOOP_VINFO_LOOP (loop_vinfo);
2967 res = compute_data_dependences_for_loop
2968 (loop, true,
2969 &LOOP_VINFO_LOOP_NEST (loop_vinfo),
2970 &LOOP_VINFO_DATAREFS (loop_vinfo),
2971 &LOOP_VINFO_DDRS (loop_vinfo));
2973 if (!res)
2975 if (dump_enabled_p ())
2976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2977 "not vectorized: loop contains function calls"
2978 " or data references that cannot be analyzed");
2979 return false;
2982 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2984 else
2986 gimple_stmt_iterator gsi;
2988 bb = BB_VINFO_BB (bb_vinfo);
2989 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2991 gimple stmt = gsi_stmt (gsi);
2992 if (!find_data_references_in_stmt (NULL, stmt,
2993 &BB_VINFO_DATAREFS (bb_vinfo)))
2995 /* Mark the rest of the basic-block as unvectorizable. */
2996 for (; !gsi_end_p (gsi); gsi_next (&gsi))
2998 stmt = gsi_stmt (gsi);
2999 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3001 break;
3004 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
3005 &BB_VINFO_DDRS (bb_vinfo),
3006 vNULL, true))
3008 if (dump_enabled_p ())
3009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3010 "not vectorized: basic block contains function"
3011 " calls or data references that cannot be"
3012 " analyzed");
3013 return false;
3016 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3019 /* Go through the data-refs, check that the analysis succeeded. Update
3020 pointer from stmt_vec_info struct to DR and vectype. */
3022 FOR_EACH_VEC_ELT (datarefs, i, dr)
3024 gimple stmt;
3025 stmt_vec_info stmt_info;
3026 tree base, offset, init;
3027 bool gather = false;
3028 int vf;
3030 if (!dr || !DR_REF (dr))
3032 if (dump_enabled_p ())
3033 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3034 "not vectorized: unhandled data-ref ");
3035 return false;
3038 stmt = DR_STMT (dr);
3039 stmt_info = vinfo_for_stmt (stmt);
3041 if (stop_bb_analysis)
3043 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3044 continue;
3047 /* Check that analysis of the data-ref succeeded. */
3048 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3049 || !DR_STEP (dr))
3051 /* If the target supports vector gather loads, see if they can
3052 be used. */
3053 if (loop_vinfo
3054 && DR_IS_READ (dr)
3055 && !TREE_THIS_VOLATILE (DR_REF (dr))
3056 && targetm.vectorize.builtin_gather != NULL
3057 && !nested_in_vect_loop_p (loop, stmt))
3059 struct data_reference *newdr
3060 = create_data_ref (NULL, loop_containing_stmt (stmt),
3061 DR_REF (dr), stmt, true);
3062 gcc_assert (newdr != NULL && DR_REF (newdr));
3063 if (DR_BASE_ADDRESS (newdr)
3064 && DR_OFFSET (newdr)
3065 && DR_INIT (newdr)
3066 && DR_STEP (newdr)
3067 && integer_zerop (DR_STEP (newdr)))
3069 dr = newdr;
3070 gather = true;
3072 else
3073 free_data_ref (newdr);
3076 if (!gather)
3078 if (dump_enabled_p ())
3080 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3081 "not vectorized: data ref analysis "
3082 "failed ");
3083 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3086 if (bb_vinfo)
3088 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3089 stop_bb_analysis = true;
3090 continue;
3093 return false;
3097 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3099 if (dump_enabled_p ())
3100 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3101 "not vectorized: base addr of dr is a "
3102 "constant");
3104 if (bb_vinfo)
3106 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3107 stop_bb_analysis = true;
3108 continue;
3111 if (gather)
3112 free_data_ref (dr);
3113 return false;
3116 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3118 if (dump_enabled_p ())
3120 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3121 "not vectorized: volatile type ");
3122 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3125 if (bb_vinfo)
3127 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3128 stop_bb_analysis = true;
3129 continue;
3132 return false;
3135 if (stmt_can_throw_internal (stmt))
3137 if (dump_enabled_p ())
3139 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3140 "not vectorized: statement can throw an "
3141 "exception ");
3142 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3145 if (bb_vinfo)
3147 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3148 stop_bb_analysis = true;
3149 continue;
3152 if (gather)
3153 free_data_ref (dr);
3154 return false;
3157 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3158 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3160 if (dump_enabled_p ())
3162 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3163 "not vectorized: statement is bitfield "
3164 "access ");
3165 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3168 if (bb_vinfo)
3170 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3171 stop_bb_analysis = true;
3172 continue;
3175 if (gather)
3176 free_data_ref (dr);
3177 return false;
3180 base = unshare_expr (DR_BASE_ADDRESS (dr));
3181 offset = unshare_expr (DR_OFFSET (dr));
3182 init = unshare_expr (DR_INIT (dr));
3184 if (is_gimple_call (stmt))
3186 if (dump_enabled_p ())
3188 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3189 "not vectorized: dr in a call ");
3190 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3193 if (bb_vinfo)
3195 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3196 stop_bb_analysis = true;
3197 continue;
3200 if (gather)
3201 free_data_ref (dr);
3202 return false;
3205 /* Update DR field in stmt_vec_info struct. */
3207 /* If the dataref is in an inner-loop of the loop that is considered
3208 for vectorization, we also want to analyze the access relative to
3209 the outer-loop (DR contains information only relative to the
3210 inner-most enclosing loop). We do that by building a reference to the
3211 first location accessed by the inner-loop, and analyze it relative to
3212 the outer-loop. */
3213 if (loop && nested_in_vect_loop_p (loop, stmt))
3215 tree outer_step, outer_base, outer_init;
3216 HOST_WIDE_INT pbitsize, pbitpos;
3217 tree poffset;
3218 enum machine_mode pmode;
3219 int punsignedp, pvolatilep;
3220 affine_iv base_iv, offset_iv;
3221 tree dinit;
3223 /* Build a reference to the first location accessed by the
3224 inner-loop: *(BASE+INIT). (The first location is actually
3225 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3226 tree inner_base = build_fold_indirect_ref
3227 (fold_build_pointer_plus (base, init));
3229 if (dump_enabled_p ())
3231 dump_printf_loc (MSG_NOTE, vect_location,
3232 "analyze in outer-loop: ");
3233 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3236 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3237 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3238 gcc_assert (outer_base != NULL_TREE);
3240 if (pbitpos % BITS_PER_UNIT != 0)
3242 if (dump_enabled_p ())
3243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3244 "failed: bit offset alignment.\n");
3245 return false;
3248 outer_base = build_fold_addr_expr (outer_base);
3249 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3250 &base_iv, false))
3252 if (dump_enabled_p ())
3253 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3254 "failed: evolution of base is not affine.\n");
3255 return false;
3258 if (offset)
3260 if (poffset)
3261 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3262 poffset);
3263 else
3264 poffset = offset;
3267 if (!poffset)
3269 offset_iv.base = ssize_int (0);
3270 offset_iv.step = ssize_int (0);
3272 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3273 &offset_iv, false))
3275 if (dump_enabled_p ())
3276 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3277 "evolution of offset is not affine.\n");
3278 return false;
3281 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3282 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3283 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3284 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3285 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3287 outer_step = size_binop (PLUS_EXPR,
3288 fold_convert (ssizetype, base_iv.step),
3289 fold_convert (ssizetype, offset_iv.step));
3291 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3292 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3293 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3294 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3295 STMT_VINFO_DR_OFFSET (stmt_info) =
3296 fold_convert (ssizetype, offset_iv.base);
3297 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3298 size_int (highest_pow2_factor (offset_iv.base));
3300 if (dump_enabled_p ())
3302 dump_printf_loc (MSG_NOTE, vect_location,
3303 "\touter base_address: ");
3304 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3305 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3306 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3307 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3308 STMT_VINFO_DR_OFFSET (stmt_info));
3309 dump_printf (MSG_NOTE,
3310 "\n\touter constant offset from base address: ");
3311 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3312 STMT_VINFO_DR_INIT (stmt_info));
3313 dump_printf (MSG_NOTE, "\n\touter step: ");
3314 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3315 STMT_VINFO_DR_STEP (stmt_info));
3316 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3317 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3318 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3322 if (STMT_VINFO_DATA_REF (stmt_info))
3324 if (dump_enabled_p ())
3326 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3327 "not vectorized: more than one data ref "
3328 "in stmt: ");
3329 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3332 if (bb_vinfo)
3334 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3335 stop_bb_analysis = true;
3336 continue;
3339 if (gather)
3340 free_data_ref (dr);
3341 return false;
3344 STMT_VINFO_DATA_REF (stmt_info) = dr;
3346 /* Set vectype for STMT. */
3347 scalar_type = TREE_TYPE (DR_REF (dr));
3348 STMT_VINFO_VECTYPE (stmt_info) =
3349 get_vectype_for_scalar_type (scalar_type);
3350 if (!STMT_VINFO_VECTYPE (stmt_info))
3352 if (dump_enabled_p ())
3354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3355 "not vectorized: no vectype for stmt: ");
3356 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3357 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3358 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3359 scalar_type);
3362 if (bb_vinfo)
3364 /* Mark the statement as not vectorizable. */
3365 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3366 stop_bb_analysis = true;
3367 continue;
3370 if (gather)
3372 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3373 free_data_ref (dr);
3375 return false;
3378 /* Adjust the minimal vectorization factor according to the
3379 vector type. */
3380 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3381 if (vf > *min_vf)
3382 *min_vf = vf;
3384 if (gather)
3386 unsigned int j, k, n;
3387 struct data_reference *olddr
3388 = datarefs[i];
3389 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
3390 struct data_dependence_relation *ddr, *newddr;
3391 bool bad = false;
3392 tree off;
3393 vec<loop_p> nest = LOOP_VINFO_LOOP_NEST (loop_vinfo);
3395 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3396 if (gather
3397 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3398 gather = false;
3399 if (!gather)
3401 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3402 free_data_ref (dr);
3403 if (dump_enabled_p ())
3405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3406 "not vectorized: not suitable for gather "
3407 "load ");
3408 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3410 return false;
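/* The DDR vector built during analysis holds one relation for every pair of
   data references, including each reference paired with itself; the index
   arithmetic below locates every relation involving OLDDR and rebuilds it
   against the new gather data reference DR.  */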
3413 n = datarefs.length () - 1;
3414 for (j = 0, k = i - 1; j < i; j++)
3416 ddr = ddrs[k];
3417 gcc_assert (DDR_B (ddr) == olddr);
3418 newddr = initialize_data_dependence_relation (DDR_A (ddr), dr,
3419 nest);
3420 ddrs[k] = newddr;
3421 free_dependence_relation (ddr);
3422 if (!bad
3423 && DR_IS_WRITE (DDR_A (newddr))
3424 && DDR_ARE_DEPENDENT (newddr) != chrec_known)
3425 bad = true;
3426 k += --n;
3429 k++;
3430 n = k + datarefs.length () - i - 1;
3431 for (; k < n; k++)
3433 ddr = ddrs[k];
3434 gcc_assert (DDR_A (ddr) == olddr);
3435 newddr = initialize_data_dependence_relation (dr, DDR_B (ddr),
3436 nest);
3437 ddrs[k] = newddr;
3438 free_dependence_relation (ddr);
3439 if (!bad
3440 && DR_IS_WRITE (DDR_B (newddr))
3441 && DDR_ARE_DEPENDENT (newddr) != chrec_known)
3442 bad = true;
3445 k = ddrs.length ()
3446 - datarefs.length () + i;
3447 ddr = ddrs[k];
3448 gcc_assert (DDR_A (ddr) == olddr && DDR_B (ddr) == olddr);
3449 newddr = initialize_data_dependence_relation (dr, dr, nest);
3450 ddrs[k] = newddr;
3451 free_dependence_relation (ddr);
3452 datarefs[i] = dr;
3454 if (bad)
3456 if (dump_enabled_p ())
3458 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3459 "not vectorized: data dependence conflict"
3460 " prevents gather load");
3461 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3463 return false;
3466 STMT_VINFO_GATHER_P (stmt_info) = true;
3468 else if (loop_vinfo
3469 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3471 bool strided_load = false;
3472 if (!nested_in_vect_loop_p (loop, stmt))
3473 strided_load = vect_check_strided_load (stmt, loop_vinfo);
3474 if (!strided_load)
3476 if (dump_enabled_p ())
3478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3479 "not vectorized: not suitable for strided "
3480 "load ");
3481 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3483 return false;
3485 STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3489 return true;
3493 /* Function vect_get_new_vect_var.
3495 Returns a new temporary variable. Its name is formed from the prefix
3496 "vect_", "stmp_" or "vect_p" (depending on the value of VAR_KIND),
3497 followed by NAME if NAME is provided. */
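/* For example, passing vect_simple_var and the string "cst_" is expected to
   yield a temporary whose name starts with "vect_cst_".  */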
3500 tree
3501 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3503 const char *prefix;
3504 tree new_vect_var;
3506 switch (var_kind)
3508 case vect_simple_var:
3509 prefix = "vect_";
3510 break;
3511 case vect_scalar_var:
3512 prefix = "stmp_";
3513 break;
3514 case vect_pointer_var:
3515 prefix = "vect_p";
3516 break;
3517 default:
3518 gcc_unreachable ();
3521 if (name)
3523 char* tmp = concat (prefix, name, NULL);
3524 new_vect_var = create_tmp_reg (type, tmp);
3525 free (tmp);
3527 else
3528 new_vect_var = create_tmp_reg (type, prefix);
3530 return new_vect_var;
3534 /* Function vect_create_addr_base_for_vector_ref.
3536 Create an expression that computes the address of the first memory location
3537 that will be accessed for a data reference.
3539 Input:
3540 STMT: The statement containing the data reference.
3541 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3542 OFFSET: Optional. If supplied, it is added to the initial address.
3543 LOOP: Specify relative to which loop-nest the address should be computed.
3544 For example, when the dataref is in an inner-loop nested in an
3545 outer-loop that is now being vectorized, LOOP can be either the
3546 outer-loop, or the inner-loop. The first memory location accessed
3547 by the following dataref ('in' points to short):
3549 for (i=0; i<N; i++)
3550 for (j=0; j<M; j++)
3551 s += in[i+j]
3553 is as follows:
3554 if LOOP=i_loop: &in (relative to i_loop)
3555 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3556 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
3557 initial address. Unlike OFFSET, which is a number of elements to
3558 be added, BYTE_OFFSET is measured in bytes.
3560 Output:
3561 1. Return an SSA_NAME whose value is the address of the memory location of
3562 the first vector of the data reference.
3563 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3564 these statement(s) which define the returned SSA_NAME.
3566 FORNOW: We are only handling array accesses with step 1. */
3568 tree
3569 vect_create_addr_base_for_vector_ref (gimple stmt,
3570 gimple_seq *new_stmt_list,
3571 tree offset,
3572 struct loop *loop,
3573 tree byte_offset)
3575 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3576 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3577 tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3578 const char *base_name;
3579 tree data_ref_base_var;
3580 tree vec_stmt;
3581 tree addr_base, addr_expr;
3582 tree dest;
3583 gimple_seq seq = NULL;
3584 tree base_offset = unshare_expr (DR_OFFSET (dr));
3585 tree init = unshare_expr (DR_INIT (dr));
3586 tree vect_ptr_type;
3587 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3588 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3589 tree base;
3591 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3593 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3595 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3597 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3598 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3599 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3602 if (loop_vinfo)
3603 base_name = get_name (data_ref_base);
3604 else
3606 base_offset = ssize_int (0);
3607 init = ssize_int (0);
3608 base_name = get_name (DR_REF (dr));
3611 data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base), "batmp");
3612 data_ref_base = force_gimple_operand (data_ref_base, &seq, true,
3613 data_ref_base_var);
3614 gimple_seq_add_seq (new_stmt_list, seq);
3616 /* Create base_offset */
3617 base_offset = size_binop (PLUS_EXPR,
3618 fold_convert (sizetype, base_offset),
3619 fold_convert (sizetype, init));
3620 dest = create_tmp_var (sizetype, "base_off");
3621 base_offset = force_gimple_operand (base_offset, &seq, true, dest);
3622 gimple_seq_add_seq (new_stmt_list, seq);
3624 if (offset)
3626 tree tmp = create_tmp_var (sizetype, "offset");
3628 offset = fold_build2 (MULT_EXPR, sizetype,
3629 fold_convert (sizetype, offset), step);
3630 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3631 base_offset, offset);
3632 base_offset = force_gimple_operand (base_offset, &seq, false, tmp);
3633 gimple_seq_add_seq (new_stmt_list, seq);
3635 if (byte_offset)
3637 tree tmp = create_tmp_var (sizetype, "offset");
3639 byte_offset = fold_convert (sizetype, byte_offset);
3640 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3641 base_offset, byte_offset);
3642 base_offset = force_gimple_operand (base_offset, &seq, false, tmp);
3643 gimple_seq_add_seq (new_stmt_list, seq);
3646 /* base + base_offset */
3647 if (loop_vinfo)
3648 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3649 else
3651 addr_base = build1 (ADDR_EXPR,
3652 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3653 unshare_expr (DR_REF (dr)));
3656 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3657 base = get_base_address (DR_REF (dr));
3658 if (base
3659 && TREE_CODE (base) == MEM_REF)
3660 vect_ptr_type
3661 = build_qualified_type (vect_ptr_type,
3662 TYPE_QUALS (TREE_TYPE (TREE_OPERAND (base, 0))));
3664 vec_stmt = fold_convert (vect_ptr_type, addr_base);
3665 addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
3666 base_name);
3667 vec_stmt = force_gimple_operand (vec_stmt, &seq, false, addr_expr);
3668 gimple_seq_add_seq (new_stmt_list, seq);
3670 if (DR_PTR_INFO (dr)
3671 && TREE_CODE (vec_stmt) == SSA_NAME)
3673 duplicate_ssa_name_ptr_info (vec_stmt, DR_PTR_INFO (dr));
3674 if (offset)
3675 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (vec_stmt));
3678 if (dump_enabled_p ())
3680 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3681 dump_generic_expr (MSG_NOTE, TDF_SLIM, vec_stmt);
3684 return vec_stmt;
3688 /* Function vect_create_data_ref_ptr.
3690 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3691 location accessed in the loop by STMT, along with the def-use update
3692 chain to appropriately advance the pointer through the loop iterations.
3693 Also set aliasing information for the pointer. This pointer is used by
3694 the callers to this function to create a memory reference expression for
3695 vector load/store access.
3697 Input:
3698 1. STMT: a stmt that references memory. Expected to be of the form
3699 GIMPLE_ASSIGN <name, data-ref> or
3700 GIMPLE_ASSIGN <data-ref, name>.
3701 2. AGGR_TYPE: the type of the reference, which should be either a vector
3702 or an array.
3703 3. AT_LOOP: the loop where the vector memref is to be created.
3704 4. OFFSET (optional): an offset to be added to the initial address accessed
3705 by the data-ref in STMT.
3706 5. GSI: location where the new stmts are to be placed if there is no loop
3707 6. ONLY_INIT: indicates whether ap is to be updated in the loop, or remain
3708 pointing to the initial address.
3709 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
3710 to the initial address accessed by the data-ref in STMT. This is
3711 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
3712 in bytes.
3714 Output:
3715 1. Declare a new ptr to vector_type, and have it point to the base of the
3716 data reference (the initial address accessed by the data reference).
3717 For example, for vector of type V8HI, the following code is generated:
3719 v8hi *ap;
3720 ap = (v8hi *)initial_address;
3722 if OFFSET is not supplied:
3723 initial_address = &a[init];
3724 if OFFSET is supplied:
3725 initial_address = &a[init + OFFSET];
3726 if BYTE_OFFSET is supplied:
3727 initial_address = &a[init] + BYTE_OFFSET;
3729 Return the initial_address in INITIAL_ADDRESS.
3731 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
3732 update the pointer in each iteration of the loop.
3734 Return the increment stmt that updates the pointer in PTR_INCR.
3736 3. Set INV_P to true if the access pattern of the data reference in the
3737 vectorized loop is invariant. Set it to false otherwise.
3739 4. Return the pointer. */
3741 tree
3742 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
3743 tree offset, tree *initial_address,
3744 gimple_stmt_iterator *gsi, gimple *ptr_incr,
3745 bool only_init, bool *inv_p, tree byte_offset)
3747 const char *base_name;
3748 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3749 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3750 struct loop *loop = NULL;
3751 bool nested_in_vect_loop = false;
3752 struct loop *containing_loop = NULL;
3753 tree aggr_ptr_type;
3754 tree aggr_ptr;
3755 tree new_temp;
3756 gimple vec_stmt;
3757 gimple_seq new_stmt_list = NULL;
3758 edge pe = NULL;
3759 basic_block new_bb;
3760 tree aggr_ptr_init;
3761 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3762 tree aptr;
3763 gimple_stmt_iterator incr_gsi;
3764 bool insert_after;
3765 bool negative;
3766 tree indx_before_incr, indx_after_incr;
3767 gimple incr;
3768 tree step;
3769 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
3770 tree base;
3772 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
3773 || TREE_CODE (aggr_type) == VECTOR_TYPE);
3775 if (loop_vinfo)
3777 loop = LOOP_VINFO_LOOP (loop_vinfo);
3778 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
3779 containing_loop = (gimple_bb (stmt))->loop_father;
3780 pe = loop_preheader_edge (loop);
3782 else
3784 gcc_assert (bb_vinfo);
3785 only_init = true;
3786 *ptr_incr = NULL;
3789 /* Check the step (evolution) of the load in LOOP, and record
3790 whether it's invariant. */
3791 if (nested_in_vect_loop)
3792 step = STMT_VINFO_DR_STEP (stmt_info);
3793 else
3794 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
3796 if (tree_int_cst_compare (step, size_zero_node) == 0)
3797 *inv_p = true;
3798 else
3799 *inv_p = false;
3800 negative = tree_int_cst_compare (step, size_zero_node) < 0;
3802 /* Create an expression for the first address accessed by this load
3803 in LOOP. */
3804 base_name = get_name (DR_BASE_ADDRESS (dr));
3806 if (dump_enabled_p ())
3808 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
3809 dump_printf_loc (MSG_NOTE, vect_location,
3810 "create %s-pointer variable to type: ",
3811 tree_code_name[(int) TREE_CODE (aggr_type)]);
3812 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
3813 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
3814 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
3815 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
3816 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
3817 else
3818 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
3819 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
3822 /* (1) Create the new aggregate-pointer variable. */
3823 aggr_ptr_type = build_pointer_type (aggr_type);
3824 base = get_base_address (DR_REF (dr));
3825 if (base
3826 && TREE_CODE (base) == MEM_REF)
3827 aggr_ptr_type
3828 = build_qualified_type (aggr_ptr_type,
3829 TYPE_QUALS (TREE_TYPE (TREE_OPERAND (base, 0))));
3830 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
3832 /* Vector and array types inherit the alias set of their component
3833 type by default so we need to use a ref-all pointer if the data
3834 reference does not conflict with the created aggregated data
3835 reference because it is not addressable. */
3836 if (!alias_sets_conflict_p (get_deref_alias_set (aggr_ptr),
3837 get_alias_set (DR_REF (dr))))
3839 aggr_ptr_type
3840 = build_pointer_type_for_mode (aggr_type,
3841 TYPE_MODE (aggr_ptr_type), true);
3842 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var,
3843 base_name);
3846 /* Likewise for any of the data references in the stmt group. */
3847 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
3849 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
3852 tree lhs = gimple_assign_lhs (orig_stmt);
3853 if (!alias_sets_conflict_p (get_deref_alias_set (aggr_ptr),
3854 get_alias_set (lhs)))
3856 aggr_ptr_type
3857 = build_pointer_type_for_mode (aggr_type,
3858 TYPE_MODE (aggr_ptr_type), true);
3859 aggr_ptr
3860 = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var,
3861 base_name);
3862 break;
3865 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo_for_stmt (orig_stmt));
3867 while (orig_stmt);
3870 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
3871 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
3872 def-use update cycles for the pointer: one relative to the outer-loop
3873 (LOOP), which is what steps (3) and (4) below do. The other is relative
3874 to the inner-loop (which is the inner-most loop containing the dataref),
3875 and this is done by step (5) below.
3877 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
3878 inner-most loop, and so steps (3),(4) work the same, and step (5) is
3879 redundant. Steps (3),(4) create the following:
3881 vp0 = &base_addr;
3882 LOOP: vp1 = phi(vp0,vp2)
3885 vp2 = vp1 + step
3886 goto LOOP
3888 If there is an inner-loop nested in LOOP, then step (5) will also be
3889 applied, and an additional update in the inner-loop will be created:
3891 vp0 = &base_addr;
3892 LOOP: vp1 = phi(vp0,vp2)
3894 inner: vp3 = phi(vp1,vp4)
3895 vp4 = vp3 + inner_step
3896 if () goto inner
3898 vp2 = vp1 + step
3899 if () goto LOOP */
3901 /* (2) Calculate the initial address of the aggregate-pointer, and set
3902 the aggregate-pointer to point to it before the loop. */
3904 /* Create: &(base[init_val+offset]) + byte_offset in the loop preheader. */
3906 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
3907 offset, loop, byte_offset);
3908 if (new_stmt_list)
3910 if (pe)
3912 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
3913 gcc_assert (!new_bb);
3915 else
3916 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
3919 *initial_address = new_temp;
3921 /* Create: p = (aggr_type *) initial_base */
3922 if (TREE_CODE (new_temp) != SSA_NAME
3923 || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
3925 vec_stmt = gimple_build_assign (aggr_ptr,
3926 fold_convert (aggr_ptr_type, new_temp));
3927 aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
3928 /* Copy the points-to information if it exists. */
3929 if (DR_PTR_INFO (dr))
3930 duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
3931 gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
3932 if (pe)
3934 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
3935 gcc_assert (!new_bb);
3937 else
3938 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
3940 else
3941 aggr_ptr_init = new_temp;
3943 /* (3) Handle the updating of the aggregate-pointer inside the loop.
3944 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
3945 inner-loop nested in LOOP (during outer-loop vectorization). */
3947 /* No update in loop is required. */
3948 if (only_init && (!loop_vinfo || at_loop == loop))
3949 aptr = aggr_ptr_init;
3950 else
3952 /* The step of the aggregate pointer is the type size. */
3953 tree step = TYPE_SIZE_UNIT (aggr_type);
3954 /* One exception to the above is when the scalar step of the load in
3955 LOOP is zero. In this case the step here is also zero. */
3956 if (*inv_p)
3957 step = size_zero_node;
3958 else if (negative)
3959 step = fold_build1 (NEGATE_EXPR, TREE_TYPE (step), step);
3961 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
3963 create_iv (aggr_ptr_init,
3964 fold_convert (aggr_ptr_type, step),
3965 aggr_ptr, loop, &incr_gsi, insert_after,
3966 &indx_before_incr, &indx_after_incr);
3967 incr = gsi_stmt (incr_gsi);
3968 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
3970 /* Copy the points-to information if it exists. */
3971 if (DR_PTR_INFO (dr))
3973 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
3974 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
3976 if (ptr_incr)
3977 *ptr_incr = incr;
3979 aptr = indx_before_incr;
3982 if (!nested_in_vect_loop || only_init)
3983 return aptr;
3986 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
3987 nested in LOOP, if it exists. */
3989 gcc_assert (nested_in_vect_loop);
3990 if (!only_init)
3992 standard_iv_increment_position (containing_loop, &incr_gsi,
3993 &insert_after);
3994 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
3995 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
3996 &indx_after_incr);
3997 incr = gsi_stmt (incr_gsi);
3998 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4000 /* Copy the points-to information if it exists. */
4001 if (DR_PTR_INFO (dr))
4003 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4004 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4006 if (ptr_incr)
4007 *ptr_incr = incr;
4009 return indx_before_incr;
4011 else
4012 gcc_unreachable ();
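
/* Illustrative sketch (not part of the vectorizer): a plain-C analogue of the
   pointer that vect_create_data_ref_ptr sets up.  The names (ELEMS, VF, init,
   offset, byte_offset) are hypothetical; the real function emits equivalent
   GIMPLE (vp0 = initial address before the loop, vp1/vp2 the phi/increment
   pair inside it).  */

#include <stdio.h>

#define ELEMS 32          /* scalar elements in the array    */
#define VF 4              /* vector factor: ints per vector  */

int
main (void)
{
  static int a[ELEMS];
  long init = 0, offset = 0, byte_offset = 0;   /* as in the comment above  */

  /* initial_address = &a[init + OFFSET] + BYTE_OFFSET  (vp0).  */
  char *vp = (char *) &a[init + offset] + byte_offset;

  /* Each vectorized iteration advances the pointer by the aggregate size
     (here VF ints): the analogue of  vp2 = vp1 + step.  */
  for (int i = 0; i + VF <= ELEMS; i += VF)
    {
      printf ("iteration %2d reads elements %d..%d\n", i / VF,
              (int) ((int *) vp - a), (int) ((int *) vp - a) + VF - 1);
      vp += VF * sizeof (int);
    }
  return 0;
}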
4016 /* Function bump_vector_ptr
4018 Increment a pointer (to a vector type) by vector-size. If requested,
4019 i.e. if PTR-INCR is given, then also connect the new increment stmt
4020 to the existing def-use update-chain of the pointer, by modifying
4021 the PTR_INCR as illustrated below:
4023 The pointer def-use update-chain before this function:
4024 DATAREF_PTR = phi (p_0, p_2)
4025 ....
4026 PTR_INCR: p_2 = DATAREF_PTR + step
4028 The pointer def-use update-chain after this function:
4029 DATAREF_PTR = phi (p_0, p_2)
4030 ....
4031 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4032 ....
4033 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4035 Input:
4036 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4037 in the loop.
4038 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4039 the loop. The increment amount across iterations is expected
4040 to be vector_size.
4041 BSI - location where the new update stmt is to be placed.
4042 STMT - the original scalar memory-access stmt that is being vectorized.
4043 BUMP - optional. The offset by which to bump the pointer. If not given,
4044 the offset is assumed to be vector_size.
4046 Output: Return NEW_DATAREF_PTR as illustrated above.
4050 tree
4051 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4052 gimple stmt, tree bump)
4054 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4055 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4056 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4057 tree update = TYPE_SIZE_UNIT (vectype);
4058 gimple incr_stmt;
4059 ssa_op_iter iter;
4060 use_operand_p use_p;
4061 tree new_dataref_ptr;
4063 if (bump)
4064 update = bump;
4066 new_dataref_ptr = copy_ssa_name (dataref_ptr, NULL);
4067 incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, new_dataref_ptr,
4068 dataref_ptr, update);
4069 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4071 /* Copy the points-to information if it exists. */
4072 if (DR_PTR_INFO (dr))
4074 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4075 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4078 if (!ptr_incr)
4079 return new_dataref_ptr;
4081 /* Update the vector-pointer's cross-iteration increment. */
4082 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4084 tree use = USE_FROM_PTR (use_p);
4086 if (use == dataref_ptr)
4087 SET_USE (use_p, new_dataref_ptr);
4088 else
4089 gcc_assert (tree_int_cst_compare (use, update) == 0);
4092 return new_dataref_ptr;
4096 /* Function vect_create_destination_var.
4098 Create a new temporary of type VECTYPE. */
4100 tree
4101 vect_create_destination_var (tree scalar_dest, tree vectype)
4103 tree vec_dest;
4104 const char *new_name;
4105 tree type;
4106 enum vect_var_kind kind;
4108 kind = vectype ? vect_simple_var : vect_scalar_var;
4109 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4111 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4113 new_name = get_name (scalar_dest);
4114 if (!new_name)
4115 new_name = "var_";
4116 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4118 return vec_dest;
4121 /* Function vect_grouped_store_supported.
4123 Returns TRUE if interleave high and interleave low permutations
4124 are supported, and FALSE otherwise. */
4126 bool
4127 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4129 enum machine_mode mode = TYPE_MODE (vectype);
4131 /* vect_permute_store_chain requires the group size to be a power of two. */
4132 if (exact_log2 (count) == -1)
4134 if (dump_enabled_p ())
4135 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4136 "the size of the group of accesses"
4137 " is not a power of 2");
4138 return false;
4141 /* Check that the permutation is supported. */
4142 if (VECTOR_MODE_P (mode))
4144 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4145 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4146 for (i = 0; i < nelt / 2; i++)
4148 sel[i * 2] = i;
4149 sel[i * 2 + 1] = i + nelt;
4151 if (can_vec_perm_p (mode, false, sel))
4153 for (i = 0; i < nelt; i++)
4154 sel[i] += nelt / 2;
4155 if (can_vec_perm_p (mode, false, sel))
4156 return true;
4160 if (dump_enabled_p ())
4161 dump_printf (MSG_MISSED_OPTIMIZATION,
4162 "interleave op not supported by target.");
4163 return false;
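
/* Illustrative sketch (not part of the vectorizer): the two selectors built
   above, printed for a hypothetical NELT of 8.  The first is the
   interleave-high mask {0, nelt, 1, nelt+1, ...}; adding nelt/2 to every
   entry yields the interleave-low mask.  */

#include <stdio.h>

int
main (void)
{
  enum { nelt = 8 };
  unsigned char sel[nelt];

  for (unsigned i = 0; i < nelt / 2; i++)
    {
      sel[i * 2] = i;
      sel[i * 2 + 1] = i + nelt;
    }
  printf ("interleave-high:");
  for (unsigned i = 0; i < nelt; i++)
    printf (" %d", sel[i]);            /* 0 8 1 9 2 10 3 11  */

  for (unsigned i = 0; i < nelt; i++)
    sel[i] += nelt / 2;
  printf ("\ninterleave-low: ");
  for (unsigned i = 0; i < nelt; i++)
    printf (" %d", sel[i]);            /* 4 12 5 13 6 14 7 15  */
  printf ("\n");
  return 0;
}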
4167 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4168 type VECTYPE. */
4170 bool
4171 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4173 return vect_lanes_optab_supported_p ("vec_store_lanes",
4174 vec_store_lanes_optab,
4175 vectype, count);
4179 /* Function vect_permute_store_chain.
4181 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4182 a power of 2, generate interleave_high/low stmts to reorder the data
4183 correctly for the stores. Return the final references for stores in
4184 RESULT_CHAIN.
4186 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4187 The input is 4 vectors each containing 8 elements. We assign a number to
4188 each element, the input sequence is:
4190 1st vec: 0 1 2 3 4 5 6 7
4191 2nd vec: 8 9 10 11 12 13 14 15
4192 3rd vec: 16 17 18 19 20 21 22 23
4193 4th vec: 24 25 26 27 28 29 30 31
4195 The output sequence should be:
4197 1st vec: 0 8 16 24 1 9 17 25
4198 2nd vec: 2 10 18 26 3 11 19 27
4199 3rd vec: 4 12 20 28 5 13 21 29
4200 4th vec: 6 14 22 30 7 15 23 31
4202 i.e., we interleave the contents of the four vectors in their order.
4204 We use interleave_high/low instructions to create such output. The input of
4205 each interleave_high/low operation is two vectors:
4206 1st vec 2nd vec
4207 0 1 2 3 4 5 6 7
4208 the even elements of the result vector are obtained left-to-right from the
4209 high/low elements of the first vector. The odd elements of the result are
4210 obtained left-to-right from the high/low elements of the second vector.
4211 The output of interleave_high will be: 0 4 1 5
4212 and of interleave_low: 2 6 3 7
4215 The permutation is done in log LENGTH stages. In each stage interleave_high
4216 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4217 where the first argument is taken from the first half of DR_CHAIN and the
4218 second argument from its second half.
4219 In our example,
4221 I1: interleave_high (1st vec, 3rd vec)
4222 I2: interleave_low (1st vec, 3rd vec)
4223 I3: interleave_high (2nd vec, 4th vec)
4224 I4: interleave_low (2nd vec, 4th vec)
4226 The output for the first stage is:
4228 I1: 0 16 1 17 2 18 3 19
4229 I2: 4 20 5 21 6 22 7 23
4230 I3: 8 24 9 25 10 26 11 27
4231 I4: 12 28 13 29 14 30 15 31
4233 The output of the second stage, i.e. the final result is:
4235 I1: 0 8 16 24 1 9 17 25
4236 I2: 2 10 18 26 3 11 19 27
4237 I3: 4 12 20 28 5 13 21 29
4238 I4: 6 14 22 30 7 15 23 31. */
4240 void
4241 vect_permute_store_chain (vec<tree> dr_chain,
4242 unsigned int length,
4243 gimple stmt,
4244 gimple_stmt_iterator *gsi,
4245 vec<tree> *result_chain)
4247 tree vect1, vect2, high, low;
4248 gimple perm_stmt;
4249 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4250 tree perm_mask_low, perm_mask_high;
4251 unsigned int i, n;
4252 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4253 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4255 result_chain->quick_grow (length);
4256 memcpy (result_chain->address (), dr_chain.address (),
4257 length * sizeof (tree));
4259 for (i = 0, n = nelt / 2; i < n; i++)
4261 sel[i * 2] = i;
4262 sel[i * 2 + 1] = i + nelt;
4264 perm_mask_high = vect_gen_perm_mask (vectype, sel);
4265 gcc_assert (perm_mask_high != NULL);
4267 for (i = 0; i < nelt; i++)
4268 sel[i] += nelt / 2;
4269 perm_mask_low = vect_gen_perm_mask (vectype, sel);
4270 gcc_assert (perm_mask_low != NULL);
4272 for (i = 0, n = exact_log2 (length); i < n; i++)
4274 for (j = 0; j < length/2; j++)
4276 vect1 = dr_chain[j];
4277 vect2 = dr_chain[j+length/2];
4279 /* Create interleaving stmt:
4280 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
4281 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4282 perm_stmt
4283 = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
4284 vect1, vect2, perm_mask_high);
4285 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4286 (*result_chain)[2*j] = high;
4288 /* Create interleaving stmt:
4289 low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
4290 nelt*3/2+1, ...}> */
4291 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4292 perm_stmt
4293 = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
4294 vect1, vect2, perm_mask_low);
4295 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4296 (*result_chain)[2*j+1] = low;
4298 memcpy (dr_chain.address (), result_chain->address (),
4299 length * sizeof (tree));
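
/* Illustrative sketch (not part of the vectorizer): emulate VEC_PERM_EXPR on
   int arrays and run the two interleave stages on the 4 x 8 example from the
   comment above; the final chain comes out as 0 8 16 24 1 9 17 25, ...  */

#include <stdio.h>
#include <string.h>

enum { NELT = 8, LEN = 4 };

/* out[i] = sel[i] < NELT ? a[sel[i]] : b[sel[i] - NELT]  */
static void
vec_perm (const int *a, const int *b, const unsigned char *sel, int *out)
{
  for (int i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? a[sel[i]] : b[sel[i] - NELT];
}

int
main (void)
{
  unsigned char hi[NELT], lo[NELT];
  int chain[LEN][NELT], result[LEN][NELT];

  for (int i = 0; i < NELT / 2; i++)
    {
      hi[i * 2] = i;                   /* interleave-high mask  */
      hi[i * 2 + 1] = i + NELT;
    }
  for (int i = 0; i < NELT; i++)
    lo[i] = hi[i] + NELT / 2;          /* interleave-low mask  */

  for (int j = 0; j < LEN; j++)        /* 1st vec = 0..7, 2nd = 8..15, ...  */
    for (int i = 0; i < NELT; i++)
      chain[j][i] = j * NELT + i;

  for (int stage = 0; stage < 2; stage++)   /* log2 (LEN) stages  */
    {
      for (int j = 0; j < LEN / 2; j++)
        {
          vec_perm (chain[j], chain[j + LEN / 2], hi, result[2 * j]);
          vec_perm (chain[j], chain[j + LEN / 2], lo, result[2 * j + 1]);
        }
      memcpy (chain, result, sizeof (chain));
    }

  for (int j = 0; j < LEN; j++)
    {
      for (int i = 0; i < NELT; i++)
        printf (" %2d", chain[j][i]);
      printf ("\n");
    }
  return 0;
}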
4303 /* Function vect_setup_realignment
4305 This function is called when vectorizing an unaligned load using
4306 the dr_explicit_realign[_optimized] scheme.
4307 This function generates the following code at the loop prolog:
4309 p = initial_addr;
4310 x msq_init = *(floor(p)); # prolog load
4311 realignment_token = call target_builtin;
4312 loop:
4313 x msq = phi (msq_init, ---)
4315 The stmts marked with x are generated only for the case of
4316 dr_explicit_realign_optimized.
4318 The code above sets up a new (vector) pointer, pointing to the first
4319 location accessed by STMT, and a "floor-aligned" load using that pointer.
4320 It also generates code to compute the "realignment-token" (if the relevant
4321 target hook was defined), and creates a phi-node at the loop-header bb
4322 whose arguments are the result of the prolog-load (created by this
4323 function) and the result of a load that takes place in the loop (to be
4324 created by the caller to this function).
4326 For the case of dr_explicit_realign_optimized:
4327 The caller to this function uses the phi-result (msq) to create the
4328 realignment code inside the loop, and sets up the missing phi argument,
4329 as follows:
4330 loop:
4331 msq = phi (msq_init, lsq)
4332 lsq = *(floor(p')); # load in loop
4333 result = realign_load (msq, lsq, realignment_token);
4335 For the case of dr_explicit_realign:
4336 loop:
4337 msq = *(floor(p)); # load in loop
4338 p' = p + (VS-1);
4339 lsq = *(floor(p')); # load in loop
4340 result = realign_load (msq, lsq, realignment_token);
4342 Input:
4343 STMT - (scalar) load stmt to be vectorized. This load accesses
4344 a memory location that may be unaligned.
4345 BSI - place where new code is to be inserted.
4346 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4347 is used.
4349 Output:
4350 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4351 target hook, if defined.
4352 Return value - the result of the loop-header phi node. */
4354 tree
4355 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4356 tree *realignment_token,
4357 enum dr_alignment_support alignment_support_scheme,
4358 tree init_addr,
4359 struct loop **at_loop)
4361 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4362 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4363 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4364 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4365 struct loop *loop = NULL;
4366 edge pe = NULL;
4367 tree scalar_dest = gimple_assign_lhs (stmt);
4368 tree vec_dest;
4369 gimple inc;
4370 tree ptr;
4371 tree data_ref;
4372 gimple new_stmt;
4373 basic_block new_bb;
4374 tree msq_init = NULL_TREE;
4375 tree new_temp;
4376 gimple phi_stmt;
4377 tree msq = NULL_TREE;
4378 gimple_seq stmts = NULL;
4379 bool inv_p;
4380 bool compute_in_loop = false;
4381 bool nested_in_vect_loop = false;
4382 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4383 struct loop *loop_for_initial_load = NULL;
4385 if (loop_vinfo)
4387 loop = LOOP_VINFO_LOOP (loop_vinfo);
4388 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4391 gcc_assert (alignment_support_scheme == dr_explicit_realign
4392 || alignment_support_scheme == dr_explicit_realign_optimized);
4394 /* We need to generate three things:
4395 1. the misalignment computation
4396 2. the extra vector load (for the optimized realignment scheme).
4397 3. the phi node for the two vectors from which the realignment is
4398 done (for the optimized realignment scheme). */
4400 /* 1. Determine where to generate the misalignment computation.
4402 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4403 calculation will be generated by this function, outside the loop (in the
4404 preheader). Otherwise, INIT_ADDR has already been computed for us by the
4405 caller, inside the loop.
4407 Background: If the misalignment remains fixed throughout the iterations of
4408 the loop, then both realignment schemes are applicable, and also the
4409 misalignment computation can be done outside LOOP. This is because we are
4410 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4411 are a multiple of VS (the Vector Size), and therefore the misalignment in
4412 different vectorized LOOP iterations is always the same.
4413 The problem arises only if the memory access is in an inner-loop nested
4414 inside LOOP, which is now being vectorized using outer-loop vectorization.
4415 This is the only case when the misalignment of the memory access may not
4416 remain fixed throughout the iterations of the inner-loop (as explained in
4417 detail in vect_supportable_dr_alignment). In this case, not only is the
4418 optimized realignment scheme not applicable, but also the misalignment
4419 computation (and generation of the realignment token that is passed to
4420 REALIGN_LOAD) have to be done inside the loop.
4422 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4423 or not, which in turn determines if the misalignment is computed inside
4424 the inner-loop, or outside LOOP. */
4426 if (init_addr != NULL_TREE || !loop_vinfo)
4428 compute_in_loop = true;
4429 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4433 /* 2. Determine where to generate the extra vector load.
4435 For the optimized realignment scheme, instead of generating two vector
4436 loads in each iteration, we generate a single extra vector load in the
4437 preheader of the loop, and in each iteration reuse the result of the
4438 vector load from the previous iteration. In case the memory access is in
4439 an inner-loop nested inside LOOP, which is now being vectorized using
4440 outer-loop vectorization, we need to determine whether this initial vector
4441 load should be generated at the preheader of the inner-loop, or can be
4442 generated at the preheader of LOOP. If the memory access has no evolution
4443 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4444 to be generated inside LOOP (in the preheader of the inner-loop). */
4446 if (nested_in_vect_loop)
4448 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4449 bool invariant_in_outerloop =
4450 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4451 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4453 else
4454 loop_for_initial_load = loop;
4455 if (at_loop)
4456 *at_loop = loop_for_initial_load;
4458 if (loop_for_initial_load)
4459 pe = loop_preheader_edge (loop_for_initial_load);
4461 /* 3. For the case of the optimized realignment, create the first vector
4462 load at the loop preheader. */
4464 if (alignment_support_scheme == dr_explicit_realign_optimized)
4466 /* Create msq_init = *(floor(p1)) in the loop preheader */
4468 gcc_assert (!compute_in_loop);
4469 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4470 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4471 NULL_TREE, &init_addr, NULL, &inc,
4472 true, &inv_p);
4473 new_temp = copy_ssa_name (ptr, NULL);
4474 new_stmt = gimple_build_assign_with_ops
4475 (BIT_AND_EXPR, new_temp, ptr,
4476 build_int_cst (TREE_TYPE (ptr),
4477 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4478 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4479 gcc_assert (!new_bb);
4480 data_ref
4481 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4482 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4483 new_stmt = gimple_build_assign (vec_dest, data_ref);
4484 new_temp = make_ssa_name (vec_dest, new_stmt);
4485 gimple_assign_set_lhs (new_stmt, new_temp);
4486 if (pe)
4488 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4489 gcc_assert (!new_bb);
4491 else
4492 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4494 msq_init = gimple_assign_lhs (new_stmt);
4497 /* 4. Create realignment token using a target builtin, if available.
4498 It is done either inside the containing loop, or before LOOP (as
4499 determined above). */
4501 if (targetm.vectorize.builtin_mask_for_load)
4503 tree builtin_decl;
4505 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4506 if (!init_addr)
4508 /* Generate the INIT_ADDR computation outside LOOP. */
4509 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4510 NULL_TREE, loop);
4511 if (loop)
4513 pe = loop_preheader_edge (loop);
4514 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4515 gcc_assert (!new_bb);
4517 else
4518 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4521 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4522 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4523 vec_dest =
4524 vect_create_destination_var (scalar_dest,
4525 gimple_call_return_type (new_stmt));
4526 new_temp = make_ssa_name (vec_dest, new_stmt);
4527 gimple_call_set_lhs (new_stmt, new_temp);
4529 if (compute_in_loop)
4530 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4531 else
4533 /* Generate the misalignment computation outside LOOP. */
4534 pe = loop_preheader_edge (loop);
4535 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4536 gcc_assert (!new_bb);
4539 *realignment_token = gimple_call_lhs (new_stmt);
4541 /* The result of the CALL_EXPR to this builtin is determined from
4542 the value of the parameter and no global variables are touched
4543 which makes the builtin a "const" function. Requiring the
4544 builtin to have the "const" attribute makes it unnecessary
4545 to call mark_call_clobbered. */
4546 gcc_assert (TREE_READONLY (builtin_decl));
4549 if (alignment_support_scheme == dr_explicit_realign)
4550 return msq;
4552 gcc_assert (!compute_in_loop);
4553 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4556 /* 5. Create msq = phi <msq_init, lsq> in loop */
4558 pe = loop_preheader_edge (containing_loop);
4559 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4560 msq = make_ssa_name (vec_dest, NULL);
4561 phi_stmt = create_phi_node (msq, containing_loop->header);
4562 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4564 return msq;
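
/* Illustrative sketch (not part of the vectorizer): what the msq/lsq/
   REALIGN_LOAD scheme above computes, at byte granularity and with a
   hypothetical vector size of 16.  msq is the aligned vector at floor (p),
   lsq the next aligned vector; the realignment token reduces here to the
   byte misalignment of p.  */

#include <stdio.h>
#include <string.h>

enum { VS = 16 };

/* result = REALIGN_LOAD <msq, lsq, misalign>: the VS bytes starting at the
   (possibly unaligned) address floor (p) + misalign.  */
static void
realign_load (const unsigned char *msq, const unsigned char *lsq,
              unsigned misalign, unsigned char *result)
{
  for (unsigned i = 0; i < VS; i++)
    result[i] = i < VS - misalign ? msq[misalign + i]
                                  : lsq[i - (VS - misalign)];
}

int
main (void)
{
  unsigned char buf[4 * VS], out[VS];
  for (unsigned i = 0; i < sizeof (buf); i++)
    buf[i] = i;

  unsigned misalign = 5;                    /* p = buf + 5, floor (p) = buf  */
  realign_load (buf, buf + VS, misalign, out);

  /* out now equals buf[5] .. buf[20], i.e. the unaligned load at p.  */
  printf ("%d\n", memcmp (out, buf + misalign, VS) == 0);
  return 0;
}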
4568 /* Function vect_grouped_load_supported.
4570 Returns TRUE if even and odd permutations are supported,
4571 and FALSE otherwise. */
4573 bool
4574 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4576 enum machine_mode mode = TYPE_MODE (vectype);
4578 /* vect_permute_load_chain requires the group size to be a power of two. */
4579 if (exact_log2 (count) == -1)
4581 if (dump_enabled_p ())
4582 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4583 "the size of the group of accesses"
4584 " is not a power of 2");
4585 return false;
4588 /* Check that the permutation is supported. */
4589 if (VECTOR_MODE_P (mode))
4591 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4592 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4594 for (i = 0; i < nelt; i++)
4595 sel[i] = i * 2;
4596 if (can_vec_perm_p (mode, false, sel))
4598 for (i = 0; i < nelt; i++)
4599 sel[i] = i * 2 + 1;
4600 if (can_vec_perm_p (mode, false, sel))
4601 return true;
4605 if (dump_enabled_p ())
4606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4607 "extract even/odd not supported by target");
4608 return false;
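
/* Illustrative sketch (not part of the vectorizer): the two selectors checked
   above, applied to a hypothetical pair of 4-element vectors.  Indices 2*i
   pick the even elements of the concatenation, 2*i + 1 the odd ones.  */

#include <stdio.h>

enum { NELT = 4 };

int
main (void)
{
  int v1[NELT] = { 0, 1, 2, 3 }, v2[NELT] = { 4, 5, 6, 7 };
  int concat[2 * NELT], even[NELT], odd[NELT];

  for (int i = 0; i < NELT; i++)
    {
      concat[i] = v1[i];
      concat[i + NELT] = v2[i];
    }
  for (int i = 0; i < NELT; i++)
    {
      even[i] = concat[i * 2];         /* sel[i] = i * 2      */
      odd[i] = concat[i * 2 + 1];      /* sel[i] = i * 2 + 1  */
    }
  /* even = 0 2 4 6, odd = 1 3 5 7, as in the comment below.  */
  for (int i = 0; i < NELT; i++)
    printf ("%d %d\n", even[i], odd[i]);
  return 0;
}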
4611 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
4612 type VECTYPE. */
4614 bool
4615 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4617 return vect_lanes_optab_supported_p ("vec_load_lanes",
4618 vec_load_lanes_optab,
4619 vectype, count);
4622 /* Function vect_permute_load_chain.
4624 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
4625 a power of 2, generate extract_even/odd stmts to reorder the input data
4626 correctly. Return the final references for loads in RESULT_CHAIN.
4628 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4629 The input is 4 vectors each containing 8 elements. We assign a number to each
4630 element, the input sequence is:
4632 1st vec: 0 1 2 3 4 5 6 7
4633 2nd vec: 8 9 10 11 12 13 14 15
4634 3rd vec: 16 17 18 19 20 21 22 23
4635 4th vec: 24 25 26 27 28 29 30 31
4637 The output sequence should be:
4639 1st vec: 0 4 8 12 16 20 24 28
4640 2nd vec: 1 5 9 13 17 21 25 29
4641 3rd vec: 2 6 10 14 18 22 26 30
4642 4th vec: 3 7 11 15 19 23 27 31
4644 i.e., the first output vector should contain the first elements of each
4645 interleaving group, etc.
4647 We use extract_even/odd instructions to create such output. The input of
4648 each extract_even/odd operation is two vectors
4649 1st vec 2nd vec
4650 0 1 2 3 4 5 6 7
4652 and the output is the vector of extracted even/odd elements. The output of
4653 extract_even will be: 0 2 4 6
4654 and of extract_odd: 1 3 5 7
4657 The permutation is done in log LENGTH stages. In each stage extract_even
4658 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
4659 their order. In our example,
4661 E1: extract_even (1st vec, 2nd vec)
4662 E2: extract_odd (1st vec, 2nd vec)
4663 E3: extract_even (3rd vec, 4th vec)
4664 E4: extract_odd (3rd vec, 4th vec)
4666 The output for the first stage will be:
4668 E1: 0 2 4 6 8 10 12 14
4669 E2: 1 3 5 7 9 11 13 15
4670 E3: 16 18 20 22 24 26 28 30
4671 E4: 17 19 21 23 25 27 29 31
4673 In order to proceed and create the correct sequence for the next stage (or
4674 for the correct output, if the second stage is the last one, as in our
4675 example), we first put the output of the extract_even operation and then the
4676 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
4677 The input for the second stage is:
4679 1st vec (E1): 0 2 4 6 8 10 12 14
4680 2nd vec (E3): 16 18 20 22 24 26 28 30
4681 3rd vec (E2): 1 3 5 7 9 11 13 15
4682 4th vec (E4): 17 19 21 23 25 27 29 31
4684 The output of the second stage:
4686 E1: 0 4 8 12 16 20 24 28
4687 E2: 2 6 10 14 18 22 26 30
4688 E3: 1 5 9 13 17 21 25 29
4689 E4: 3 7 11 15 19 23 27 31
4691 And RESULT_CHAIN after reordering:
4693 1st vec (E1): 0 4 8 12 16 20 24 28
4694 2nd vec (E3): 1 5 9 13 17 21 25 29
4695 3rd vec (E2): 2 6 10 14 18 22 26 30
4696 4th vec (E4): 3 7 11 15 19 23 27 31. */
4698 static void
4699 vect_permute_load_chain (vec<tree> dr_chain,
4700 unsigned int length,
4701 gimple stmt,
4702 gimple_stmt_iterator *gsi,
4703 vec<tree> *result_chain)
4705 tree data_ref, first_vect, second_vect;
4706 tree perm_mask_even, perm_mask_odd;
4707 gimple perm_stmt;
4708 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4709 unsigned int i, j, log_length = exact_log2 (length);
4710 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
4711 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4713 result_chain->quick_grow (length);
4714 memcpy (result_chain->address (), dr_chain.address (),
4715 length * sizeof (tree));
4717 for (i = 0; i < nelt; ++i)
4718 sel[i] = i * 2;
4719 perm_mask_even = vect_gen_perm_mask (vectype, sel);
4720 gcc_assert (perm_mask_even != NULL);
4722 for (i = 0; i < nelt; ++i)
4723 sel[i] = i * 2 + 1;
4724 perm_mask_odd = vect_gen_perm_mask (vectype, sel);
4725 gcc_assert (perm_mask_odd != NULL);
4727 for (i = 0; i < log_length; i++)
4729 for (j = 0; j < length; j += 2)
4731 first_vect = dr_chain[j];
4732 second_vect = dr_chain[j+1];
4734 /* data_ref = permute_even (first_data_ref, second_data_ref); */
4735 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
4736 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4737 first_vect, second_vect,
4738 perm_mask_even);
4739 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4740 (*result_chain)[j/2] = data_ref;
4742 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
4743 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
4744 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4745 first_vect, second_vect,
4746 perm_mask_odd);
4747 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4748 (*result_chain)[j/2+length/2] = data_ref;
4750 memcpy (dr_chain.address (), result_chain->address (),
4751 length * sizeof (tree));
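
/* Illustrative sketch (not part of the vectorizer): where one stage of the
   loop above places its outputs, for a hypothetical LENGTH of 4.  The even
   result of pair j goes to slot j/2 and the odd result to slot
   j/2 + LENGTH/2, which is exactly the E1, E3, E2, E4 input order of the
   next stage shown in the comment above.  */

#include <stdio.h>

int
main (void)
{
  enum { length = 4 };

  for (int j = 0; j < length; j += 2)
    printf ("pair (%d,%d): even -> slot %d, odd -> slot %d\n",
            j, j + 1, j / 2, j / 2 + length / 2);
  return 0;
}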
4756 /* Function vect_transform_grouped_load.
4758 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
4759 to perform their permutation and ascribe the result vectorized statements to
4760 the scalar statements.
4763 void
4764 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
4765 gimple_stmt_iterator *gsi)
4767 vec<tree> result_chain = vNULL;
4769 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
4770 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
4771 vectors, that are ready for vector computation. */
4772 result_chain.create (size);
4773 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
4774 vect_record_grouped_load_vectors (stmt, result_chain);
4775 result_chain.release ();
4778 /* RESULT_CHAIN contains the output of a group of grouped loads that were
4779 generated as part of the vectorization of STMT. Assign the statement
4780 for each vector to the associated scalar statement. */
4782 void
4783 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
4785 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
4786 gimple next_stmt, new_stmt;
4787 unsigned int i, gap_count;
4788 tree tmp_data_ref;
4790 /* Put a permuted data-ref in the VECTORIZED_STMT field.
4791 Since we scan the chain starting from its first node, their order
4792 corresponds to the order of data-refs in RESULT_CHAIN. */
4793 next_stmt = first_stmt;
4794 gap_count = 1;
4795 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
4797 if (!next_stmt)
4798 break;
4800 /* Skip the gaps. Loads created for the gaps will be removed by the dead
4801 code elimination pass later. No need to check for the first stmt in
4802 the group, since it always exists.
4803 GROUP_GAP is the number of steps in elements from the previous
4804 access (if there is no gap GROUP_GAP is 1). We skip loads that
4805 correspond to the gaps. */
4806 if (next_stmt != first_stmt
4807 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
4809 gap_count++;
4810 continue;
4813 while (next_stmt)
4815 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
4816 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
4817 copies, and we put the new vector statement in the first available
4818 RELATED_STMT. */
4819 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
4820 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
4821 else
4823 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4825 gimple prev_stmt =
4826 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
4827 gimple rel_stmt =
4828 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
4829 while (rel_stmt)
4831 prev_stmt = rel_stmt;
4832 rel_stmt =
4833 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
4836 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
4837 new_stmt;
4841 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
4842 gap_count = 1;
4843 /* If NEXT_STMT accesses the same DR as the previous statement,
4844 put the same TMP_DATA_REF as its vectorized statement; otherwise
4845 get the next data-ref from RESULT_CHAIN. */
4846 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4847 break;
4852 /* Function vect_can_force_dr_alignment_p.
4854 Returns whether the alignment of a DECL can be forced to be aligned
4855 on an ALIGNMENT-bit boundary. */
4857 bool
4858 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
4860 if (TREE_CODE (decl) != VAR_DECL)
4861 return false;
4863 /* We cannot change alignment of common or external symbols as another
4864 translation unit may contain a definition with lower alignment.
4865 The rules of common symbol linking mean that the definition
4866 will override the common symbol. The same is true for constant
4867 pool entries which may be shared and are not properly merged
4868 by LTO. */
4869 if (DECL_EXTERNAL (decl)
4870 || DECL_COMMON (decl)
4871 || DECL_IN_CONSTANT_POOL (decl))
4872 return false;
4874 if (TREE_ASM_WRITTEN (decl))
4875 return false;
4877 /* Do not override the alignment as specified by the ABI when the used
4878 attribute is set. */
4879 if (DECL_PRESERVE_P (decl))
4880 return false;
4882 /* Do not override explicit alignment set by the user when an explicit
4883 section name is also used. This is a common idiom used by many
4884 software projects. */
4885 if (DECL_SECTION_NAME (decl) != NULL_TREE
4886 && !DECL_HAS_IMPLICIT_SECTION_NAME_P (decl))
4887 return false;
4889 if (TREE_STATIC (decl))
4890 return (alignment <= MAX_OFILE_ALIGNMENT);
4891 else
4892 return (alignment <= MAX_STACK_ALIGNMENT);
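
/* Illustrative sketch (not part of the vectorizer): the distinction the
   predicate above draws.  A definition owned by this translation unit may
   have its alignment raised (here forced by hand with the GCC aligned
   attribute); an external declaration may not, since the defining unit
   controls the real alignment.  */

/* Local definition: alignment can be forced, e.g. to a vector boundary.  */
static float local_buf[256] __attribute__ ((aligned (32)));

/* External symbol: the definition (and its alignment) lives elsewhere, so
   only the ABI-guaranteed alignment may be assumed.  */
extern float extern_buf[256];

float
sum_first (void)
{
  return local_buf[0] + extern_buf[0];
}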
4896 /* Return whether the data reference DR is supported with respect to its
4897 alignment.
4898 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
4899 if it is aligned, i.e., check if it is possible to vectorize it with different
4900 alignment. */
4902 enum dr_alignment_support
4903 vect_supportable_dr_alignment (struct data_reference *dr,
4904 bool check_aligned_accesses)
4906 gimple stmt = DR_STMT (dr);
4907 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4908 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4909 enum machine_mode mode = TYPE_MODE (vectype);
4910 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4911 struct loop *vect_loop = NULL;
4912 bool nested_in_vect_loop = false;
4914 if (aligned_access_p (dr) && !check_aligned_accesses)
4915 return dr_aligned;
4917 if (loop_vinfo)
4919 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
4920 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
4923 /* Possibly unaligned access. */
4925 /* We can choose between using the implicit realignment scheme (generating
4926 a misaligned_move stmt) and the explicit realignment scheme (generating
4927 aligned loads with a REALIGN_LOAD). There are two variants to the
4928 explicit realignment scheme: optimized, and unoptimized.
4929 We can optimize the realignment only if the step between consecutive
4930 vector loads is equal to the vector size. Since the vector memory
4931 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
4932 is guaranteed that the misalignment amount remains the same throughout the
4933 execution of the vectorized loop. Therefore, we can create the
4934 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
4935 at the loop preheader.
4937 However, in the case of outer-loop vectorization, when vectorizing a
4938 memory access in the inner-loop nested within the LOOP that is now being
4939 vectorized, while it is guaranteed that the misalignment of the
4940 vectorized memory access will remain the same in different outer-loop
4941 iterations, it is *not* guaranteed that it will remain the same throughout
4942 the execution of the inner-loop. This is because the inner-loop advances
4943 with the original scalar step (and not in steps of VS). If the inner-loop
4944 step happens to be a multiple of VS, then the misalignment remains fixed
4945 and we can use the optimized realignment scheme. For example:
4947 for (i=0; i<N; i++)
4948 for (j=0; j<M; j++)
4949 s += a[i+j];
4951 When vectorizing the i-loop in the above example, the step between
4952 consecutive vector loads is 1, and so the misalignment does not remain
4953 fixed across the execution of the inner-loop, and the realignment cannot
4954 be optimized (as illustrated in the following pseudo vectorized loop):
4956 for (i=0; i<N; i+=4)
4957 for (j=0; j<M; j++){
4958 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
4959 // when j is {0,1,2,3,4,5,6,7,...} respectively.
4960 // (assuming that we start from an aligned address).
4963 We therefore have to use the unoptimized realignment scheme:
4965 for (i=0; i<N; i+=4)
4966 for (j=k; j<M; j+=4)
4967 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
4968 // that the misalignment of the initial address is
4969 // 0).
4971 The loop can then be vectorized as follows:
4973 for (k=0; k<4; k++){
4974 rt = get_realignment_token (&vp[k]);
4975 for (i=0; i<N; i+=4){
4976 v1 = vp[i+k];
4977 for (j=k; j<M; j+=4){
4978 v2 = vp[i+j+VS-1];
4979 va = REALIGN_LOAD <v1,v2,rt>;
4980 vs += va;
4981 v1 = v2;
4984 } */
4986 if (DR_IS_READ (dr))
4988 bool is_packed = false;
4989 tree type = (TREE_TYPE (DR_REF (dr)));
4991 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
4992 && (!targetm.vectorize.builtin_mask_for_load
4993 || targetm.vectorize.builtin_mask_for_load ()))
4995 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4996 if ((nested_in_vect_loop
4997 && (TREE_INT_CST_LOW (DR_STEP (dr))
4998 != GET_MODE_SIZE (TYPE_MODE (vectype))))
4999 || !loop_vinfo)
5000 return dr_explicit_realign;
5001 else
5002 return dr_explicit_realign_optimized;
5004 if (!known_alignment_for_access_p (dr))
5005 is_packed = not_size_aligned (DR_REF (dr));
5007 if (targetm.vectorize.
5008 support_vector_misalignment (mode, type,
5009 DR_MISALIGNMENT (dr), is_packed))
5010 /* Can't software pipeline the loads, but can at least do them. */
5011 return dr_unaligned_supported;
5013 else
5015 bool is_packed = false;
5016 tree type = (TREE_TYPE (DR_REF (dr)));
5018 if (!known_alignment_for_access_p (dr))
5019 is_packed = not_size_aligned (DR_REF (dr));
5021 if (targetm.vectorize.
5022 support_vector_misalignment (mode, type,
5023 DR_MISALIGNMENT (dr), is_packed))
5024 return dr_unaligned_supported;
5027 /* Unsupported. */
5028 return dr_unaligned_unsupported;
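
/* Illustrative sketch (not part of the vectorizer): the misalignment
   behaviour described in the comment above, for hypothetical 16-byte vectors
   of 4-byte elements.  With the scalar inner-loop step (j += 1) the byte
   misalignment of &vp[i + j] cycles 0, 4, 8, 12, ...; with a step that is a
   multiple of the vector size (j += 4) it stays fixed, so the optimized
   realignment scheme applies.  */

#include <stdio.h>

enum { VS = 16, ELT = 4 };

int
main (void)
{
  for (int j = 0; j < 8; j++)
    printf ("j=%d  step=1: misalign=%2d   step=4: misalign=%2d\n",
            j, j * ELT % VS, j * 4 * ELT % VS);
  return 0;
}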